{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 原始数据分析"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练数据中类别分布\n",
    "question_topic_train_set.txt \n",
    "\n",
    "问题与话题标签的绑定关系。一共有两列，各个列之间用 \\t 分割。注意，如果一个问题绑定了多个话题标签，这些标签是无序的。格式如下：\n",
    "\n",
    "`question_id\\ttopic_id1,topic_id2,...,topic_idn`（两列之间为制表符 \\t，多个话题标签之间用逗号分隔）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt \n",
    "import pickle \n",
    "from tqdm import tqdm\n",
    "from __future__ import division\n",
    "import time\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_id</th>\n",
       "      <th>topic_ids</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6555699376639805223</td>\n",
       "      <td>7739004195693774975,3738968195649774859</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2887834264226772863</td>\n",
       "      <td>-3149765934180654494</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           question_id                                topic_ids\n",
       "0  6555699376639805223  7739004195693774975,3738968195649774859\n",
       "1  2887834264226772863                     -3149765934180654494"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_qtopic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\\t', names=['question_id', 'topic_ids'], \n",
    "                        dtype={'question_id': object, 'topic_ids': object})  # question_topic\n",
    "df_qtopic.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练数据中一共有2999967个问题\n",
      "max_num=19, min_num=1, mean_num=2\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_id</th>\n",
       "      <th>topic_ids</th>\n",
       "      <th>topic_num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6555699376639805223</td>\n",
       "      <td>[7739004195693774975, 3738968195649774859]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2887834264226772863</td>\n",
       "      <td>[-3149765934180654494]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           question_id                                   topic_ids  topic_num\n",
       "0  6555699376639805223  [7739004195693774975, 3738968195649774859]          2\n",
       "1  2887834264226772863                      [-3149765934180654494]          1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print '训练数据中一共有%d个问题' % len(df_qtopic)\n",
    "df_qtopic['topic_ids'] = df_qtopic.topic_ids.apply(lambda ids: ids.split(','))\n",
    "df_qtopic['topic_num'] = df_qtopic.topic_ids.apply(lambda ids: len(ids))\n",
    "max_ids_num = max(df_qtopic.topic_num.values)\n",
    "min_ids_num = min(df_qtopic.topic_num.values)\n",
    "mean_ids_num = np.mean(df_qtopic.topic_num.values)\n",
    "print 'max_num=%d, min_num=%d, mean_num=%d' % (max_ids_num, min_ids_num, mean_ids_num)\n",
    "df_qtopic.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 标签中问题的顺序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "questions1 = df_qtopic.question_id.values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 每个问题的标签数量分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fb13fcd3690>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZYAAAELCAYAAAD6AKALAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl4VdXVwOHfSgIoCkKUYEgCEYlkYIgQBa2kMkQQLZZB\nFLGGScG2KlVQrG3V1iFqLfBZHEAQVAShokGkCIIKWKYEwjxZQQikgMoYICSwvj/O4ZpAQm7CTW5C\n1vs89+Hefc+wdoQsz9lnry2qijHGGOMrAf4OwBhjzIXFEosxxhifssRijDHGpyyxGGOM8SlLLMYY\nY3zKEosxxhifssRijDHGpyyxGGOM8SlLLMYYY3wqyN8BeOuKK67QyMhIf4dhjDGVSnp6+g+qWq88\nz1lpEktkZCRpaWn+DsMYYyoVEfm+vM9pt8KMMcb4lCUWY4wxPmWJxRhjjE9ZYjHGGONTlliMMcb4\nlCUWY4wxPmWJxRhjjE9ZYjHGGONTlWaC5NpdB4kc8Zm/wzDGmHK1PeU2Ro8ezbhx41BV7r//foYO\nHcrq1asZMmQIR44cITIyksmTJ1O7dm1OnDjB4MGDSUtLIyAgAKDW6WOJyF3AU0AgMEtVn3DbGwKT\ngDrudyNUdXa+/WoDG4BPVPX3xcXstysWEZkgIntFZJ2/YjDGmIpu3bp1jBs3juXLl7N69WpmzZrF\nt99+y6BBg0hJSWHt2rV0796dV155BYBx48YBsHbtWubNmwcQLiIBInI58ArQUVXjgCtFpKN7mj8B\n01T1WuBu4PUzwvgbsNDbmP15K2wi0MWP5zfGmApv48aNtGnThpo1axIUFMQvf/lLZsyYwZYtW0hM\nTAQgKSmJjz76CIANGzbQoUMHAEJCQgBOAglAY2Crqu5zD/0F0NN9r0Bt9/1lwO7T5xeR1kB9YK63\nMfstsajqQuAnf53fGGMqg2bNmrFo0SJ+/PFHjh49yuzZs9m5cydxcXGkpqYCMH36dHbu3AlAy5Yt\nmTlzJnl5eWzbtg2gJhABfAs0FZFIEQkCfu22AzwD3CsimcBs4CEAEQkAXgWGlSRmG7w3xpgKLCYm\nhieeeIJbbrmFLl26EB8fT2BgIBMmTOD111+ndevWHD58mOrVqwMwYMAAwsPDSUhIYOjQoQBHgJOq\nuh94EPgQWARsx7maAegDTFTVcKAr8J6bVH4LzFbVzJLEXKEH70XkAeABgMDa5Vr12RhjKoyBAwcy\ncOBAAP74xz8SHh5OdHQ0c+c6d6e2bNnCZ585DzcFBQUxcuRIz77u1ckWAFX9FPjUbX+AnxPLQNyh\nCVVdIiIXAVcANwDtROS3wKVAdRE5oqojzhVvhb5iUdWxqpqgqgmBNS/zdzjGGOMXe/fuBWDHjh3M\nmDGDe+65x9N26tQpnnvuOYYMGQLA0aNHyc7OBjg9eK+qugFARELcP+viXI287Z5iB9DR/S4GuAjY\np6p9VbWhqkbi3A57t7ikAhX8isUYYwz07NmTH3/8kWrVqjFmzBjq1KnD6NGjGTNmDAA9evSgf//+\ngJOEOnfuTEBAAGFhYQDb8h1qtIi0dN//VVW3uO8fA8aJyB9wBvL7qaqWNl45j33Pi4hMAW7Gudza\nAzytquOL2r5GaJSGJo8qp+iMMaZi2J5y23ntLyLpqprgo3C8O6e/EktJJSQkqK0gaYwxJeOPxFKh\nx1iMMcZUPpVmjMVKuhhzYdmechubN2/mrrvu8rR99913/PWvf2XJkiVs3rwZgAMHDlCnTh0yMjIA\nePHFFxk/fjyBgYH83//9H507dwagS5cuZGVlkZeXR7t27RgzZgyBgYEsXLiQoUOHsmbNGqZOnUqv\nXr0855s0aRLPPfccAH/6059ITk4ur+5f0PySWEQkAngXZzanAmNVdbQ/YjHG+E/Tpk09CePkyZOE\nhYXRvXv30/MvAHjssce47DLnqdANGz
YwdepU1q9fz+7du+nUqRNbtmwhMDCQadOmUbt2bVSVXr16\nMX36dO6++24aNmzIxIkT+fvf/17g3D/99BPPPvssaWlpiAitW7emW7du1K1bt/x+ABcof90KywMe\nU9VYoC3wOxGJ9VMsxpgKYP78+Vx99dU0atTI06aqTJs2jT59+gCQmprK3XffTY0aNbjqqqto0qQJ\ny5cvB6B2baciSV5eHidOnEBEAIiMjKRFixanCzJ6fP755yQlJREcHEzdunVJSkpizpw55dHVC55f\nEouqZqnqSvf9YWAjEOaPWIwxFcPUqVM9CeS0RYsWUb9+faKiogDYtWsXERERnu/Dw8PZtWuX53Pn\nzp0JCQmhVq1aBW55Faa4Y5nS8/vgvYhEAtcCy/wbiTHGX06cOMHMmTO58847C7RPmTLlrGRzLp9/\n/jlZWVnk5OSwYMECX4dpvOTXxCIilwIfAUNV9VAh3z8gImkiknby6MHyD9AYUy7+/e9/06pVK+rX\nr+9py8vLY8aMGQUG98PCwjzFFgEyMzNPTwL0uOiii7jjjjs8BRqL4s2xTOn4cz2WajhJZbKqzihs\nGyvpYkzVUNiVyRdffEF0dDTh4eGetm7dujF16lRycnLYtm0bW7du5frrr+fIkSNkZWUBTkL67LPP\niI6OPuc5O3fuzNy5c9m/fz/79+9n7ty5nifMzPnx11NhAowHNqrqP/wRgzGmYsjOzmbevHm89dZb\nBdoLG3OJi4ujd+/exMbGEhQU5HmkODs7m27dupGTk8OpU6do3769p3bWihUr6N69O/v37+fTTz/l\n6aefZv369QQHB/PnP/+Z6667DoC//OUvBAcHl0+nL3B+mXkvIjfhlG1eC5xym/+YfynMM1lJF2Mu\nLOdbqsR4xx8z7/1yxaKqiwEpyT7Nwy4jzf4iGmNMhef3p8KMMcZcWCyxGGOM8Sl/Dd5fBCwEargx\n/EtVnz7XPlYrzBjvbU+5jQMHDjBo0CDWrVuHiDBhwgRmz55NamoqAQEBhISEMHHiRBo0aEBubi6D\nBg1i5cqV5OXlcd999/Hkk08CzhNbL7zwAiJCgwYNeP/997niiiv4xz/+wdtvv01QUBD16tVjwoQJ\nnlnzXbp0YenSpdx0003MmjXLnz8K4wf+umLJATqoaksgHugiIm39FIsxF6RHHnmELl26sGnTJlav\nXk1MTAzDhw9nzZo1ZGRkcPvtt/PXv/4VgOnTp5OTk8PatWtJT0/nrbfeYvv27eTl5fHII4/w5Zdf\nsmbNGlq0aME///lPAK699lrS0tJYs2YNvXr14vHHH/ece/jw4bz33nt+6bfxP3+VdFFVPeJ+rOa+\nKsfCMMZUAgcPHmThwoWeddKrV69OnTp1PPW0wHnM93Q9LREhOzubvLw8jh07RvXq1T0FHVWV7Oxs\nVJVDhw7RoEEDANq3b0/NmjUBaNu2LZmZmZ5jd+zYkVq1apVXd00F488JkoEikgHsBeapqpV0McZH\ntm3bRr169ejfvz/XXnstgwYN8qyD/tRTTxEREcHkyZM9Vyy9evXikksuITQ0lIYNGzJs2DCCg4Op\nVq0ab7zxBs2bN6dBgwZs2LDBk6zyGz9+PLfeemu59tFUXH5LLKp6UlXjgXDgehFpduY2VtLFmNLJ\ny8tj5cqVPPjgg6xatYpLLrmElJQUAJ5//nl27txJ3759Pbe1li9fTmBgILt372bbtm28+uqrfPfd\nd+Tm5vLGG2+watUqdu/eTYsWLXjxxRcLnOv9998nLS2N4cOHl3s/TcXk96fCVPUA8CXQpZDvrKSL\nMaUQHh5OeHg4bdq0AZwrkpUrVxbYpm/fvnz00UcAfPDBB3Tp0oVq1aoREhLCL37xC9LS0jxrpVx9\n9dWICL179+Y///mP5xhffPEFzz//PDNnzqRGjRrl1DtT0fklsYhIPRGp476/GEgCNvkjFmMuRFde\neS
URERGeVRjnz59PbGwsW7du9WyTmprqqafVsGFDTzXg7Oxsli5dSnR0NGFhYWzYsIF9+/YBMG/e\nPGJiYgBYtWoVgwcPZubMmYSEhJRn90wF56+liUOBSSISiJPcpqmqPZNojA+99tpr9O3blxMnTtC4\ncWPeeecdBg0axObNmwkICKBRo0a8+eabAPzud7+jf//+xMXFoar079+fFi1aAPD000+TmJhItWrV\naNSoERMnTgScJ7+OHDniKXXfsGFDZs6cCUC7du3YtGkTR44cITw8nPHjx1uBxyrEL7XCSiMhIUHT\n0tL8HYYxxlQq/qgV5vcxFmOMMRcWSyzGGGN8yl9jLIAzlwVIA3ap6u3n2tZKulRN21Nu4/jx4yQm\nJpKTk0NeXh69evXi2WefZfXq1QwZMoQjR44QGRnJ5MmTPRMA16xZw+DBgzl06BABAQGsWLGCiy66\niA8//JDnn3+ekydPcvvtt/PSSy8BkJOTw3333Ud6ejqXX345H374IZGRkXz//fd0796dU6dOkZub\ny0MPPeRZ58MYUzh/X7E8Amz0cwymgqtRowYLFixg9erVZGRkMGfOHJYuXcqgQYNISUlh7dq1dO/e\nnVdeeQVw5nDce++9vPnmm6xfv56vvvqKatWq8eOPPzJ8+HDmz5/P+vXr+d///sf8+fMBZ4Jf3bp1\n+fbbb/nDH/7AE088AUBoaChLliwhIyODZcuWkZKSwu7du/32szCmMvDnzPtw4DbgbX/FYCoHEeHS\nSy8FIDc3l9zcXESELVu2kJiYCEBSUpJnTsbcuXNp0aIFLVu2BODyyy8nMDCQ7777jqioKOrVqwdA\np06dPPukpqaSnJwMOHM+5s+fj6pSvXp1z/yM06sTGmPOzZ9XLKOAx/l5BUljinTy5Eni4+MJCQkh\nKSmJNm3aEBcXR2pqKuAUUdy5cycAW7ZsQUTo3LkzrVq14uWXXwagSZMmbN682VNc8ZNPPvHss2vX\nLiIiIgAICgrisssu48cffwRg586dtGjRgoiICJ544glPrSxjTOH8NUHydmCvqqYXs52VdDEABAYG\nkpGRQWZmJsuXL2fdunVMmDCB119/ndatW3P48GGqV68OOLfCFi9ezOTJk1m8eDEff/wx8+fPp27d\nurzxxhvcddddtGvXjsjISAIDA4s9d0REBGvWrOHbb79l0qRJ7Nmzp6y7a0yl5q8rll8A3URkOzAV\n6CAi75+5kZV0MWeqU6cO7du3Z86cOURHRzN37lzS09Pp06cPV199NeCUM0lMTOSKK66gZs2adO3a\n1VPO5Fe/+hXLli1jyZIlNG3alGuuuQaAsLAwz9VLXl4eBw8e5PLLLy9w7gYNGtCsWTMWLVpUjj02\npvLxV9n8J1U1XFUjgbuBBap6rz9iMRXfvn37OHDgAADHjh1j3rx5REdHs3fvXgBOnTrFc88953la\nq3Pnzqxdu5ajR4+Sl5fH119/TWxsLIBnn/379/P6668zaNAgALp168akSZMA+Ne//kWHDh0QETIz\nMzl27Jhnn8WLF9O0adPy67wxlZBfHzc2xhtZWVkkJydz8uRJTp06Re/evbn99tsZPXo0Y8aMAaBH\njx70798fgLp16/Loo49y3XXXISJ07dqV2267DXAWv1q9ejUAf/nLXzxXLAMHDuQ3v/kNTZo0ITg4\nmKlTpwKwceNGHnvsMUQEVWXYsGE0b968vH8ExlQqlaakS43QKA1NHuXvMEw5255ym79DMKZS80dJ\nl0pzxdI87DLS7JeMMcZUeP6eIGmMMeYC47crFveJsMPASSCvuEu1qlbSZXvKbQwYMIBZs2YREhLC\nunXrAMjIyGDIkCEcP36coKAgXn/9da6//npOnDjB4MGDSUtLIyAggNGjR3PzzTcDcPPNN5OVlcXF\nF18MOBMIQ0JC2LFjB8nJyRw4cICTJ0+SkpJC165dAefx3tNjCfnL
oRtjTHH8fSusvar+4OcYKqx+\n/frx+9//nvvuu8/T9vjjj/P0009z6623Mnv2bB5//HG++uorxo0bB8DatWvZu3cvt956KytWrCAg\nwLkonTx5MgkJBXP3c889R+/evXnwwQfZsGEDXbt2Zfv27QBcfPHFntUDjTGmJOxWWAWWmJhIcHBw\ngTYR4dChQwAcPHjQMwt8w4YNdOjQAYCQkBDq1KlDcevXFHUsY4w5H/5MLAp8ISLpIvKAH+OoVEaN\nGsXw4cOJiIhg2LBhvPjiiwC0bNmSmTNnkpeXx7Zt20hPT/dM+ANITk4mPj6ev/3tb5x+EvCZZ57h\n/fffJzw8nK5du/Laa695tj9+/DitWrWibdu2fPLJJ+XbSWNMpebPxHKTqsYDtwK/E5HEMzewki5n\ne+ONNxg5ciQ7d+5k5MiRDBw4EIABAwYQHh5OQkICQ4cO5cYbb/SUK5k8eTLr169n0aJFLFq0iPfe\new+AKVOm0K9fPzIzM5k9eza/+c1vPEUWv//+e1auXMkHH3zA0KFD+e9//+ufDhtjKh2/JRZV3eX+\nuRf4GLi+kG2spMsZJk2aRI8ePQC48847Wb58OeAUThw5ciQZGRmkpqZy4MCBAuVKAGrVqsU999zj\n2Wf8+PH07t0bgBtuuIHjx4/zww8/FNincePG3Hzzzaxatar8OmmMqdT8VYTyEhGpdfo9cAuwzh+x\nVDYNGjTg66+/BmDBggVERUUBcPToUbKzswGYN28eQUFBxMbGkpeX50kWubm5zJo1i2bNmgHO016n\n1yPZuHEjx48fp169euzfv5+cnBwAfvjhB7755htPSRRjjCmOv54Kqw98LCKnY/hAVef4KZYKq0+f\nPnz11Vf88MMPhIeH8+yzzzJu3DgeeeQR8vLyuOiiixg7dizg1MDq3LkzAQEBhIWFeW535eTk0Llz\nZ3Jzczl58iSdOnXi/vvvB+DVV1/l/vvvZ+TIkYgIEydORETYuHEjgwcPJiAggFOnTjFixAhLLMYY\nr1lJlwrKSpkYY3zBSrqcg5V0McaYysHmsRhjjPGpSpNY1u46yM6dO2nfvj2xsbHExcUxevRoAP78\n5z/TokUL4uPjueWWW9i9e7dnvxdffJEmTZrQtGlTPv/8c097eno6zZs3p0mTJjz88MNUlluCxhhT\n4amqX15AHeBfwCZgI3DDubavfmUT3b17t6anp6uq6qFDhzQqKkrXr1+vBw8e1NNGjx6tgwcPVlXV\n9evXa4sWLfT48eP63XffaePGjTUvL09VVa+77jpdsmSJnjp1Srt06aKzZ89WY4y50ABpWs6/3/15\nxTIamKOq0UBLN7mcU2hoKK1atQKcORkxMTHs2rWL2rVre7bJzs7GfdqM1NRU7r77bmrUqMFVV11F\nkyZNWL58OVlZWRw6dIi2bdsiItx33302u9wYY3zEL4P3InIZkAj0A1DVE8CJkhxj+/btrFq1ijZt\n2gDw1FNP8e6773LZZZfx5ZdfArBr1y7atm3r2Sc8PJxdu3ZRrVo1wsPDz2o3xhhz/vx1xXIVsA94\nR0RWicjb7kTJAooq6XLkyBF69uzJqFGjPFcrzz//PDt37qRv377885//LK9+GGOMOYO/EksQ0Ap4\nQ1WvBbKBEWdupIWUdMnNzaVnz5707dvXU9okv759+/LRRx8BTlmS/IUYMzMzCQsLIywsjMzMzLPa\njTHGnD9/JZZMIFNVl7mf/4WTaM5JVRk4cCAxMTE8+uijnvatW7d63qemphIdHQ1At27dmDp1Kjk5\nOWzbto2tW7dy/fXXExoaSu3atVm6dCmqyrvvvssdd9zhy/4ZY0yV5ZcxFlX9n4jsFJGmqroZ6Ahs\nKG6/b775hvfee4/mzZsTHx8PwAsvvMD48ePZvHkzAQEBNGrUiDfffBOAuLg4evfuTWxsLEFBQYwZ\nM8ZT8ff111+nX79+HDt2jFtv
vZVbb721zPprjDFVid9KuohIPPA2UB34DuivqvuL2r5GaJTmZG0t\n6mtjjDGFqFIlXVQ1A/C6s83DrGy+McZUBpVm5r0xxpjKodjEIiIBItK7PIIpzoABAwgJCfGsJwLw\n008/kZSURFRUFElJSezf79xNO3HiBP3796d58+a0bNmSr776CnDWLbntttuIjo4mLi6OESPOehjN\nGGPMeSg2sajqKeBxX55URJqKSEa+1yERGVrcfv369WPOnILLtqSkpNCxY0e2bt1Kx44dSUlJAWDc\nuHEArF27lnnz5vHYY495lt0dNmwYmzZtYtWqVXzzzTf8+9//9mX3jDGmSvP2VtgXIjJMRCJEJPj0\nq7QnVdXNqhqvzpr3rYGjOMsTn1NiYiLBwQVPm5qaSnJyMgDJycme0iwbNmygQ4cOAISEhFCnTh3S\n0tKoWbMm7du3B6B69eq0atWqwJwWY4wx58fbxHIX8DtgIZDuvtJ8FENH4L+q+n1pdt6zZw+hoaEA\nXHnllezZsweAli1bMnPmTPLy8ti2bRvp6ekFJksCHDhwgE8//ZSOHTueXw+MMcZ4ePVUmKpeVYYx\n3A1MKewLEXkAeACc9dmLIyKeApQDBgxg48aNJCQk0KhRI2688UbPHBaAvLw8+vTpw8MPP0zjxo19\n0A1jjDHgZWIRkZrAo0BDVX1ARKKApqo663xOLiLVgW7Ak4V9r6pjgbEACQkJhU64qV+/PllZWYSG\nhpKVlUVISAgAQUFBjBw50rPdjTfeyDXXXOP5/MADDxAVFcXQocUO7RhjjCkBb2+FvYNTffhG9/Mu\n4DkfnP9WYKWq7intAbp168akSZMAmDRpkqc0y9GjR8nOzgZg3rx5BAUFERsbC8Cf/vQnDh48yKhR\no84zfGOMMWfyaua9iKSpaoKIrHKLRiIiq1W15XmdXGQq8LmqvlPctgkJCRoVFcVXX33FDz/8QP36\n9Xn22Wf59a9/Te/evdmxYweNGjVi2rRpBAcHs337djp37kxAQABhYWGMHz+eRo0akZmZSUREBNHR\n0dSoUQOA3//+9wwaNOh8umKMMRWSP2bee5tY/oMzyP6NqrYSkauBKap6falP7JTJ3wE0VtWDxW2f\nkJCgaWm+el7AGGOqhopc0uVpYA4QISKTgV/gLtJVWqqaDVx+PscwxhhT8Xj7VNg8EVkJtAUEeERV\nfyjTyIwxxlRKJSlC+UvgJkCBangxobEsREZGUqtWLQIDAwkKCiItLY3hw4fz6aefUr16da6++mre\neecd6tSpA8CaNWsYPHgwhw4dIiAggBUrVnDRRRf5I3RjjKkSvHoqTEReB4YAa4F1wGARGXM+JxaR\nP4jIehFZJyJTRMTr3/ZffvklGRkZnB5zSUpKYt26daxZs4ZrrrmGF198EXDmqtx77728+eabrF+/\nnq+++opq1aqdT9jGGGOK4e3jxh2Azqr6jvsEV1e3rVREJAx4GEhQ1WZAIM5EyVK55ZZbCApyLr7a\ntm3rKdEyd+5cWrRoQcuWzsNrl19+eYFJksYYY3zP28TyLZB/6nuE23Y+goCLRSQIqAns9mYnEaFT\np060bt2asWPHnvX9hAkTPKtBbtmyBRGhc+fOtGrVipdffvk8QzbGGFOcc46xiMinOGMqtYCNIrLc\n/dwGWF7ak6rqLhH5O87jxseAuao615t9Fy9eTFhYGHv37iUpKYno6GgSExMBeP755wkKCqJv376A\ncyts8eLFrFixgpo1a9KxY0dat25ttcGMMaYMFTd4//eyOKmI1AXuAK4CDgDTReReVX3/jO3OqhUW\nFhYGOBWLu3fvzvLly0lMTGTixInMmjWL+fPne+qFhYeHk5iYyBVXXAFA165dWblypSUWY4wpQ+e8\nFaaqX+d/AatwBvBPv0qrE7BNVfepai4wg5/LxeQ//1hVTVDVhHr16pGdnc3hw4cByM7OZu7cuT
Rr\n1ow5c+bw8ssvM3PmTGrWrOnZv3Pnzqxdu5ajR4+Sl5fH119/7SnrYowxpmx4W4TyAeCvwHHgFM5c\nFgVKWxZ4B9DWLW55DGdWf7HT6vfs2UP37t0B5zbXPffcQ5cuXWjSpAk5OTkkJSUBzgD+m2++Sd26\ndXn00Ue57rrrEBG6du3KbbfdVsqQjTHGeMPbki5bgRt8OSlSRJ7FWeclD+dKaJCq5hS1vZV0McaY\nkqvIJV3+i7PKo8+o6tM4pWKMMcZcQLxNLE8C/xGRZYDnqkJVHy6TqIwxxlRa3s5jeQtYACzl56WJ\n08sqqKLs3LmT9u3bExsbS1xcHKNHjwZg+vTpxMXFERAQQP7bZbm5uSQnJ9O8eXNiYmI8M/KNMcaU\nHW+vWKqp6qO+PLGIPALcj/MgwDhVLXbVraCgIF599VVatWrF4cOHad26NUlJSTRr1owZM2YwePDg\nAttPnz6dnJwcz5NhsbGx9OnTh8jISF92xRhjTD7eJpZ/u0+GfUrBW2E/leakItIMJ6lcj7My5RwR\nmaWq55zNHxoaSmhoKAC1atUiJiaGXbt2eZ4GK+Q8ZGdnk5eXx7Fjx6hevTq1a9cuTcjGGGO85O2t\nsD644yz8fBvsfB7RigGWqepRVc0DvgZ6lOQA27dvZ9WqVbRp06bIbXr16sUll1xCaGgoDRs2ZNiw\nYQQHB59H2MYYY4rj7XosV/n4vOuA50Xkcpx5LF0pQaI6cuQIPXv2ZNSoUee8Alm+fDmBgYHs3r2b\n/fv3065dOzp16kTjxqWdfmOMMaY43k6QvK+wdlV9tzQnVdWNIvISMBfIBjKAk4Wc96ySLrm5ufTs\n2ZO+ffvSo8e5L3I++OADunTpQrVq1QgJCeEXv/gFaWlplliMMaYMeXsr7Lp8r3bAM0C38zmxqo5X\n1daqmgjsB7YUsk2Bki6qysCBA4mJieHRR4t/lqBhw4YsWLAAcErALF26lOjo6PMJ2xhjTDG8mnl/\n1k4idYCpqtql1CcWCVHVvSLSEOfKpa2qHihq+4SEBB01ahTt2rWjefPmBAQ4OfGFF14gJyeHhx56\niH379lGnTh3i4+P5/PPPOXLkCP3792fDhg2oKv3792f48OGlDdkYYyodf8y8L21iqQasU9WmpT6x\nyCLgciAXeFRV559reyvpYowxJVdhS7rkW5cFnNtnscC08zmxqrY7n/2NMcZUTN7OY8m/Lkse8L2q\nZpZBPMYYYyo5rwbvz1iX5Rt/JZUBAwYQEhJCs2bNPG0//fQTSUlJREVFkZSUxP79+wGYPHky8fHx\nnldAQAAYaL9yAAAWv0lEQVQZGRn+CNsYY6oUrxKLiPQQka0iclBEDonIYRE55MV+E0Rkr4isy9cW\nLCLz3OPNc1eT9Eq/fv2YM2dOgbaUlBQ6duzI1q1b6dixIykpKQD07duXjIwMMjIyeO+997jqqquI\nj4/39lTGGGNKydvHjV8GuqnqZapaW1Vrqao3tVEmAmc+OTYCmK+qUcB897NXEhMTz5o5n5qaSnJy\nMgDJycl88sknZ+03ZcoU7r77bm9PY4wx5jx4m1j2qOrGkh5cVRcCZ9YTuwOY5L6fBPy6pMctENie\nPZ76YVdeeSV79uw5a5sPP/yQPn36nM9pjDHGeMnbwfs0EfkQ+ISCRShnlOKc9VU1y33/P6B+KY5R\nKBFBRAq0LVu2jJo1axYYlzHGGFN2vE0stXFWkLwlX5sCpUksPx9AVUWkyIk0hZV0OVP9+vXJysoi\nNDSUrKwsQkJCCnw/depUu1oxxphy5O1TYf0LeQ04/b2IPFmCc+4RkVB3v1Bg7znOW6CkS2G6devG\npEnOnbVJkyZxxx13eL47deoU06ZNs/EVY4wpR96OsRTnzhJsOxNIdt8nA6ne7tinTx9uuOEGNm/e\nTHh4OOPHj2fEiBHMmzePqKgovvjiC0aM+PlZgIULFxIREW
FFJ40xphyVqqTLWQcRWaWq1xbSPgW4\nGbgC2AM8jTNOMw1oCHwP9PZmwTAr6WKMMSVXYUu6eKHQ7KSqRQ1udPTReY0xxlQwvroVJsVvYowx\npirw1RXLdB8dp1iRkZHUqlWLwMBAgoKCSEtLY/Xq1QwZMoQjR44QGRnJ5MmTbW17Y4zxE29Lukxy\n12A5/bmuiEw4/VlVXyhiv8JKutwpIutF5JSIlOq+35dffklGRganx1wGDRpESkoKa9eupXv37rzy\nyiulOawxxhgf8PZWWIv8i3Cp6n7grMH6Qkzk7JIu64AewEIvz12sLVu2kJiYCEBSUhIfffSRrw5t\njDGmhLxNLAH5i0WKSDBe3EYrrKSLqm5U1c0lijIfEaFTp060bt2asWPHAhAXF0dqqvPU8vTp09m5\nc2dpD2+MMeY8eTvG8iqwRESm4wzU9wKeL7OozmHx4sWEhYWxd+9ekpKSiI6OZsKECTz88MP87W9/\no1u3blSvXt0foRljjMH7mffv4ty+2oNT36uHqr5XloGBU9JFRNJEJG3fvn0AhIWFARASEkL37t1Z\nvnw50dHRzJ07l/T0dPr06cPVV19d1qEZY4wpwjkTi4jUdv8MxkkoH7iv/7ltZerMki7Z2dkcPnwY\ngOzsbObOnUuzZs3Yu9epCnPq1Cmee+45hgwZUtahGWOMKUJxt8I+AG4H0nEmQeafr6JAudZK2bNn\nD927dwcgLy+Pe+65hy5dujB69GjGjBkDQI8ePejfv395hmWMMSYfn5R0KfLghZd0+Ql4DagHHAAy\nVLVzcceyki7GGFNyFbqki4j0AG7CuVJZpKpnL9V4hnOUdPnY2/MaY4ypXLydIPk6MARYizMPZYiI\njCnLwIwxxlRO3l6xdABi1L1vJiKTgPVlFpUxxphKy9sJkt/ilLk/LcJtO6ciSrq8IiKbRGSNiHyc\nv1RMcXbu3En79u2JjY0lLi6O0aNHA3DXXXcRHx9PfHw8kZGRxMfHe3tIY4wxPubtFUstYKOILHc/\nXwekichMAFXtVsR+E4F/Au/ma5sHPKmqeSLyEvAk8IRXwQYF8eqrr9KqVSsOHz5M69atSUpK4sMP\nP/Rs89hjj3HZZZd52S1jjDG+5m1i+UtpDq6qC0Uk8oy2ufk+LsWZxe+V0NBQQkNDAahVqxYxMTHs\n2rWL2NjY08dm2rRpLFiwoDThGmOM8QGvEouqfi0i9XGuVACWq2qRa9WXwADgw2K3KsT27dtZtWoV\nbdq08bQtWrSI+vXrExUV5YPQjDHGlIa3T4X1BpbjrG3fG1gmIl5faRRxzKeAPGDyObY5q6QLwJEj\nR+jZsyejRo0qsO7KlClT6NOnqCecjTHGlAdvb4U9BVx3+ipFROoBXwD/Ks1JRaQfzoz+jnqOGZqq\nOhYYC84ESYDc3Fx69uxJ37596dGjh2fbvLw8ZsyYQXp6emlCMsYY4yPeJpaAM259/UgplzUWkS7A\n48AvVfVoSfZVVQYOHEhMTAyPPvpoge+++OILoqOjCQ8PL01YxhhjfMTb5PBvEflcRPq5VxufAbOL\n28kt6bIEaCoimSIyEOcpsVrAPBHJEJE3vQ32m2++4b333mPBggWex4tnz3bCmDp1qt0GM8aYCsCr\nWmHuY8HLcEq6ACwC2qqqV48J+4LVCjPGmJKryLXCktwkMuN0g4g8i5fzT4wxxlQd50wsIvIg8Fug\nsYisyfdVLeCbsgzMGGNM5VTcGMsHwK+Ame6fp1+tVfXe4g5eREmXv7nlXDJEZK6INPA22KJKuvz0\n008kJSURFRVFUlIS+/fv9/aQxhhjfKys12NJBI4A76pqM7ettqoect8/DMSqarFLPiYkJOinn35K\nVlZWgZIun3zyCRMnTiQ4OJgRI0aQkpLC/v37eemll8qsX8YYU1n4Y4ylVI8Me0tVF+Is7JW/7VC+\nj5fgrO/ildDQUFq1ag
UULOmSmppKcnIyAMnJyXzySbFLxRhjjCkjXi/05Usi8jxwH3AQaF+aY+Qv\n6bJnzx5PDbErr7ySPXv2+C5YY4wxJVKmVyxFUdWnVDUCp5zL74varqQlXdx9EJGyCt0YY0wx/JJY\n8pkM9CzqS1Udq6oJqppQr149oPCSLvXr1ycrKwuArKwsQkJCyj5yY4wxhSr3xCIi+UsP3wFs8nbf\nokq6dOvWjUmTJgEwadIk7rjjDl+Fa4wxpoTK+qmwKcDNwBXAHuBpoCvQFDgFfA8MUdVdxR0rISFB\nR40aRbt27WjevDkBAU5OfOGFF2jTpg29e/dmx44dNGrUiGnTphEcHFxGvTLGmMrDH0+FlWli8SUr\n6WKMMSV3wT1ubIwxpuqxxGKMMcanyjSxFFbSJd93j4mIisgVJTnmgQMH6NWrF9HR0cTExLBkyRLf\nBWyMMea8lfUVy0Sgy5mNIhIB3ALsKOkBH3nkEbp06cKmTZtYvXo1MTEx5x+lMcYYnyn3ki6ukTir\nSJboyYGDBw+ycOFCBg4cCED16tWpU6fOecdpjDHGd/wxj+UOYJeqri7pvtu2baNevXr079+fa6+9\nlkGDBpGdnV0GURpjjCmtck0sIlIT+CPwFy+3L1DSJS8vj5UrV/Lggw+yatUqLrnkElJSUso2aGOM\nMSVS3lcsVwNXAatFZDsQDqwUkSsL2/jMki7h4eGEh4fTpk0bAHr16sXKlSvLK3ZjjDFeKNfEoqpr\nVTVEVSNVNRLIBFqp6v+82f/KK68kIiKCzZs3AzB//nxiY2PLLmBjjDElVqZl8/OXdBGRTOBpVR1/\nPsd87bXX6Nu3LydOnKBx48a88847vgjVGGOMj1hJF2OMuYBZSRdjjDGVniUWY4wxPuWXki4i8pCI\nbBKR9SLyckmOefLkSa699lpuv/123wZrjDHGJ8q9pIuItMdZ4KulqsYBfy/JAUePHm1lXIwxpgLz\nR0mXB4EUVc1xt9nr7fEyMzP57LPPGDRokA+jNMYY40v+GGO5BmgnIstE5GsRuc7bHYcOHcrLL7/s\nWT3SGGNMxeOP39BBQDDQFhgOTBMRKWzD/CVdduzYQUhICK1bty7PWI0xxpRQmc9jEZFIYJaqNnM/\nzwFeUtUv3c//Bdqq6r5zHSc0NFQDAwMJCgri+PHjHDp0iB49evD++++XafzGGFOZVZV5LJ8A7QFE\n5BqgOvBDcTuFhYWRmZnJ9u3bmTp1Kh06dLCkYowxFVC5l3QBJgAT3EeQTwDJWlmm/xtjjCmWlXQx\nxpgLWFW5FWaMMeYCZonFGGOMT1WqxHL8+HGuv/56WrZsSVxcHE8//bS/QzLGGHOGcq8VJiLPiMgu\nEclwX129PV6NGjVYsGABq1evJiMjgzlz5rB06dKyCd4YY0yplHutMNdIVY13X7O9PZiIcOmllwKQ\nm5tLbm4uRcytNMYY4yf+qBV2Xk6ePEl8fDwhISEkJSXRpk0bXx7eGGPMefLXGMtDIrLGvVVWt6iN\n8pd02bfPmZgfGBhIRkYGmZmZLF++nHXr1hW1uzHGGD/wR2J5A2gMxANZwKtFbaiqY1U1QVUT6tWr\nV+C7OnXq0L59e+bMmVOmwRpjjCmZck8sqrpHVU+q6ilgHHC9t/vu27ePAwcOAHDs2DHmzZtHdHR0\nGUVqjDGmNMq0pEthRCRUVbPcj90Br+9lZWVlkZyczMmTJzl16hS9e/e2lSSNMaaC8UetsJtFJB5Q\nYDsw2NvjtWjRglWrVpVBpMYYY3ylTBOLqvYppHl8WZ7TGGOMf1WqmffGGGMqvkqVWAYMGEBISAjN\nmjXzdyjGGGOK4I+SLh/mK+eyXUQyvD1ev3797PFiY4yp4Mq9pIuq3nW6nAvwETDD24MlJiYSHBzs\n2wiNMcb4VFkP3i9017w/izhFvnoDHcoyBmOMMeXLn2Ms7YA9qrq1qA0KK+lijDGmYvNn
YukDTDnX\nBucq6WKMMaZiKveZ9wAiEgT0AFr74/zGGGPKjr+uWDoBm1Q1syQ79enThxtuuIHNmzcTHh7O+PE2\n19IYYyqaci/poqrjgbsp5jZYYaZMKfEuxhhjypk/Srqgqv3K8rzGGGP8p1LNvDfGGFPxVarEYiVd\njDGm4vNHSZeWIrJERNaKyKciUtvb41lJF2OMqfjKvaQL8DYwQlWbAx8Dw709mJV0McaYiq9ME4uq\nLgR+OqP5GmCh+34e0LMsYzDGGFO+/DHGsh64w31/JxDhhxiMMcaUEX8klgHAb0UkHagFnChqQ6sV\nZowxlU+5JxZV3aSqt6hqa5xJkv89x7ZWK8wYYyqZck8sIhLi/hkA/Al409t9raSLMcZUfOVe0gW4\nVER+524yA3jH2+NZSRdjjKn4/FLSBRhdluc1xhjjP5Vq5r0xxpiKzxKLMcYYn7LEYowxxqcssRhj\njPEpSyzGGGN8yhKLMcYYn7LEYowxxqcssRhjjPEpSyzGGGN8SlTV3zF4RUQOA5v9HYcfXQH84O8g\n/KQq9x2s/9b/8+t/I1Ut1yq+ZVrSxcc2q2qCv4PwFxFJq6r9r8p9B+u/9b/y9d9uhRljjPEpSyzG\nGGN8qjIllrH+DsDPqnL/q3Lfwfpv/a9kKs3gvTHGmMqhMl2xGGOMqQQqfGIRkS4isllEvhWREf6O\npyREJEJEvhSRDSKyXkQecduDRWSeiGx1/6ybb58n3b5uFpHO+dpbi8ha97v/ExFx22uIyIdu+zIR\nicy3T7J7jq0iklx+PS9IRAJFZJWIzHI/V5n+i0gdEfmXiGwSkY0ickNV6b+I/MH9e79ORKaIyEUX\net9FZIKI7BWRdfna/NpnEbnK3fZbd9/qZf1zQFUr7AsIBP4LNAaqA6uBWH/HVYL4Q4FW7vtawBYg\nFngZGOG2jwBect/Hun2sAVzl9j3Q/W450BYQ4N/ArW77b4E33fd3Ax+674OB79w/67rv6/rp5/Ao\n8AEwy/1cZfoPTAIGue+rA3WqQv+BMGAbcLH7eRrQ70LvO5AItALW5Wvza5/dn/3d7vs3gQfL/OdQ\n3v/QSvgf6Qbg83yfnwSe9Hdc59GfVCAJZ6JnqNsWijNH56z+AZ+7P4NQYFO+9j7AW/m3cd8H4Uyk\nkvzbuN+9BfTxQ5/DgflAB35OLFWi/8BlOL9c5Yz2C77/OIllp/uLLgiYBdxSRfoeScHE4rc+u9/9\nAAS57QV+p5bVq6LfCjv9l/O0TLet0nEvWa8FlgH1VTXL/ep/QH33fVH9DXPfn9leYB9VzQMOApef\n41jlbRTwOHAqX1tV6f9VwD7gHfdW4NsicglVoP+qugv4O7ADyAIOqupcqkDfC+HPPl8OHHC3PfNY\nZaaiJ5YLgohcCnwEDFXVQ/m/U+d/Iy7IR/NE5HZgr6qmF7XNhdx/nP+jbAW8oarXAtk4t0I8LtT+\nu+MId+Ak1wbAJSJyb/5tLtS+n0tV6XNFTyy7gIh8n8PdtkpDRKrhJJXJqjrDbd4jIqHu96HAXre9\nqP7uct+f2V5gHxEJwrn98uM5jlWefgF0E5HtwFSgg4i8T9XpfyaQqarL3M//wkk0VaH/nYBtqrpP\nVXOBGcCNVI2+n8mfff4RqONue+axyk553Xcs5b3KIJxBqKv4efA+zt9xlSB+Ad4FRp3R/goFB/Ne\ndt/HUXAw7zuKHszr6rb/joKDedPc98E49/fruq9tQLAffxY38/MYS5XpP7AIaOq+f8bt+wXff6AN\nsB6o6cY8CXioivQ9koJjLH7tMzCdgoP3vy3zn0F5/0MrxX+krjhPU/0XeMrf8ZQw9ptwLnvXABnu\nqyvOfc/5wFbgi/x/6YGn3L5uxn0SxG1PANa53/2Tnye3XuT+xfnW/cvYON8+A9z2b4H+fv5Z3MzP\niaXK9B+IB9LcvwOfuP/oq0T/gWeBTW7c7+H8Ar2g
+w5MwRlTysW5Yh3o7z7jPFW73G2fDtQo65+D\nzbw3xhjjUxV9jMUYY0wlY4nFGGOMT1liMcYY41OWWIwxxviUJRZjjDE+ZYnFGGOMT1liMVWSiPzx\njM//8VcsJSEiz4jIMH/HYcy5WGIxVVWBxKKqN/orkPIiDvs3b8qc/SUzFZaIPCUiW0RksbtQ1DAR\n+UpEEtzvr3DrkJ1eTOwVEVkhImtEZLDbHioiC0Ukw11wqp2IpAAXu22T3e2OuH+Ke5x17kJLd7nt\nN7vnPr1o1+TTiy8VEft2EXlWRFa6x4l22wtccbjniXRfm0RkotvnySLSSUS+cRduuj7f4VuKyBK3\n/f58xxqer//Pum2R7iJS7+LM5M5fT8qYMhFU/CbGlD8RaY1TCyke5+/pSqDIKsk4pTMOqup1IlID\n+EZE5gI9cNafeF5EAoGaqrpIRH6vqvGFHKeHe86WwBXAChFZ6H53LU59p93ANzhFNhefI6YfVLWV\niPwWGAYMKqbbTYA7cUpzrADuwSkL1A3nCuvX7nYtcOpIXQKsEpHPgGZAFHA9Tn2pmSKSiFO2PgpI\nVtWlxZzfGJ+wxGIqqnbAx6p6FEBEZhaz/S1ACxHp5X6+DOcX6gpggltl+hNVzSjmODcBU1T1JE5V\n2q+B64BDwHJVzXTjycApNniuxHK6mnU6TsIqzjZVXesefz0wX1VVRNa65zotVVWPAcdE5EucZHIT\nzs9glbvNpTj93wF8b0nFlCdLLKayyePnW7gX5WsX4CFV/fzMHdz/c78NmCgi/1DVd0t57px8709S\n/L+f09vn3zZ//FCwD/mPfyrf51NnnOvMAn+K0/8XVfWt/F+4C8xlFxOnMT5lYyymoloI/FpELhaR\nWsCv3PbtQGv3fa98238OPOhemSAi14jIJSLSCNijquOAt3HWQwHIPb3tGRYBd7ljNvVw1jBf7sN+\nbT8dg4i0wimXXlJ3iMhFInI5TtXoFTj9HyDOonKISJiIhPgkYmNKyK5YTIWkqitF5EOc9Sr24vzy\nBGe522ki8gDwWb5d3sa5XbTSHVTfhzMmcTMwXERygSPAfe72Y4E1IrJSVfvmO87HOOuCr8a5Enhc\nVf93evDdBz4C7nNvdS3DWRKipNYAX+KMAf1NVXcDu0UkBljiPlNwBLgX52rJmHJlZfNNpSAizwBH\nVPXv/o7FGHNudivMGGOMT9kVizHnQUQ+5uxxkicKe4jAmKrCEosxxhifslthxhhjfMoSizHGGJ+y\nxGKMMcanLLEYY4zxKUssxhhjfOr/AZDwaOQaoCgVAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fb129472410>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "topic_num_count = df_qtopic.topic_num.value_counts()\n",
    "labels = topic_num_count.index\n",
    "sizes = topic_num_count.values\n",
    "\n",
    "y_pos = np.arange(len(sizes))\n",
    "\n",
    "plt.clf()\n",
    "fig, ax = plt.subplots()\n",
    "ax.barh(y_pos, sizes, align='center', linewidth=0.5)\n",
    "ax.set_yticks(y_pos)\n",
    "ax.set_yticklabels(labels)\n",
    "# 标上数值\n",
    "for x,y in zip(sizes, y_pos):\n",
    "    ax.text(x+8, y+0.3, '%d'%x)\n",
    "ax.invert_yaxis()  # labels read top-to-bottom\n",
    "ax.set_xlabel('question_number')\n",
    "ax.set_ylabel('topic_number')\n",
    "# plt.savefig('../figs/3-tags_distribution.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在训练集中一共提供了将近 **300万 个问题**（2,999,967 个），每个问题最少有 1 个、最多有 19 个话题标签，平均约 **2.34** 个（7,022,750 / 2,999,967）。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 话题总数与每个话题的问题数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "话题总频数\u001b[1;35m 7022750 \u001b[0m\n",
      "不同的话题个数 \u001b[1;35m 1999 \u001b[0m\t\n"
     ]
    }
   ],
   "source": [
    "from itertools import chain\n",
    "\n",
    "topic_ids = list(chain(*(df_qtopic.topic_ids.values)))\n",
    "print '话题总频数\\033[1;35m %d \\033[0m' % len(topic_ids)\n",
    "\n",
    "sr_topic_ids = pd.Series(topic_ids)\n",
    "topic_ids_count = sr_topic_ids.value_counts()\n",
    "ids = topic_ids_count.index\n",
    "id_count = topic_ids_count.values\n",
    "print '不同的话题个数 \\033[1;35m %d \\033[0m\\t' % len(ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "每个话题下面问题的数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "最多 \u001b[1;35m 66259 \u001b[0m 个问题\n",
      "最少 \u001b[1;35m 1636 \u001b[0m 个问题\n",
      "平均 \u001b[1;35m 3513 \u001b[0m 个问题\n"
     ]
    }
   ],
   "source": [
    "print '最多 \\033[1;35m %d \\033[0m 个问题' % max(id_count)\n",
    "print '最少 \\033[1;35m %d \\033[0m 个问题' % min(id_count) \n",
    "print '平均 \\033[1;35m %d \\033[0m 个问题' % np.mean(id_count) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZYAAAELCAYAAAD6AKALAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAG91JREFUeJzt3X+wVOWd5/H3J2CII4q/krsUMgNZiRnUjfGyhmTVgsFE\nkpjBJOpguSPJslKzmqypJDvCpGoqU1vs6EwZN66rM0ywRJMRiYkjG8e4itw1bgQDxgRRGa4BVxiU\n8kfAa6ITyHf/OE/roe0L3X2f/nHv/byquvqc7znPOd8+t+kvz3NOn1ZEYGZmlss7Op2AmZmNLC4s\nZmaWlQuLmZll5cJiZmZZubCYmVlWLixmZpaVC4uZmWXlwmJmZlm5sJiZWVZjO51AJxx//PExZcqU\nptq+9tprHHHEEXkTysB5NcZ5NaZb84LuzW0k5rVx48YXI+Ldh1wxIkbdo7e3N5q1du3aptu2kvNq\njPNqTLfmFdG9uY3EvIANUcdnrIfCzMwsKxcWMzPLyoXFzMyycmExM7OsXFjMzCwrFxYzM8vKhcXM\nzLJyYTEzs6xcWMzMLCsXFjMzy8qFxczMsnJhMTOzrFxYzMwsKxcWMzPLyoXFzMyycmExM7OsXFjM\nzCwrFxYzM8vKhcXMzLJyYTEzs6xcWMzMLCsXFjMzy8qFxczMsnJhMTOzrFxYzMwsq5YXFklHS7pT\n0tOSnpL0YUnHSrpf0tb0fExp/SWS+iVtkXRuKd4raVNadr0kpfg4SXek+HpJU1r9mszMbHDt6LF8\nE/hhRLwf+ADwFLAYWBMR04A1aR5J04H5wMnAXOBGSWPSdm4CLgOmpcfcFF8IvBIRJwLXAde04TWZ\nmdkgWlpYJE0AzgaWA0TEv0TEL4F5wIq02grg/DQ9D1gZEW9ExDagHzhD0kTgqIhYFxEB3FrVprKt\nO4E5ld6MmZm1n4rP6RZtXDoNWAY8SdFb2QhcCeyMiKPTOqLocRwt6QZgXUR8Oy1bDtwLbAeujohz\nUvws4KqIOE/SE8DciNiRlj0DfCgiXqzKZRGwCKCnp6d35cqVTb2mgYEBxo8f31TbVnJejXFejenW\nvKB7cxuJec2ePXtjRMw41Hpjm9p6/cYCpwNfjIj1kr5JGvaqiIiQ1Lrq9tZ+llEUOWbMmBGzZs1q\najt9fX0027aVnFdjnFdjujUv6N7cRnNerT7HsgPYERHr0/ydFIXmhTS8RXrenZbvBCaX2p+QYjvT\ndHX8gDaSxgITgJeyvxIzM6tLSwtLRDwPPCfppBSaQzEsthpYkGILgLvT9GpgfrrSayrFSfpHI2IX\nsFfSzDR0dmlVm8q2LgAejFaO75mZ2UG1eigM4IvAdyS9E/gF8HmKgrZK0kLgWeAigIjYLGkVRfHZ\nB1wREfvTdi4HbgEOpzjvcm+KLwduk9QPvExxVZmZmXVIywtLRDwO1DrZM2eQ9ZcCS2vENwCn1Ii/\nDlw4xDTNzCwTf/PezMyycmExM7OsXFjMzCwrFxYzM8vKhcXMzLJyYTEzs6xcWMzMLCsXFjMzy8qF\nxczMsnJhMTOzrFxYzMwsKxcWMzPLyoXFzMyycmExM7OsXFjMzCwrFxYzM8vKhcXMzLJyYTEzs6xc\nWMzMLCsXFjMzy8qFxczMsnJhMTOzrFpeWCRtl7RJ0uOSNqTYsZLul7Q1PR9TWn+JpH5JWySdW4r3\npu30S7peklJ8nKQ7Uny9pCmtfk1mZja4dvVYZkfEaRExI80vBtZExDRgTZpH0nRgPnAyMBe4UdKY\n1OYm4DJgWnrMTfGFwCsRcSJwHXBNG16PmZkNolNDYfOAFWl6BXB+Kb4yIt6IiG1AP3CGpInAURGx\nLiICuLWqTWVbdwJzKr0ZMzNrv3YUlgAekLRR0qIU64mIXWn6eaAnTU8Cniu13ZFik9J0dfyANhGx\nD9gDHJf7RZiZWX3GtmEfZ0bETknvAe
6X9HR5YUSEpGh1EqmoLQLo6emhr6+vqe0MDAw03baVnFdj\nnFdjujUv6N7cRnVeEdG2B/B14KvAFmBiik0EtqTpJcCS0vr3AR9O6zxdil8M/G15nTQ9FngR0MHy\n6O3tjWatXbu26bat5Lwa47wa0615RXRvbiMxL2BD1PFZ39KhMElHSDqyMg18DHgCWA0sSKstAO5O\n06uB+elKr6kUJ+kfjWLYbK+kmen8yaVVbSrbugB4MB0AMzPrgFYPhfUAd6Vz6WOBv4+IH0r6CbBK\n0kLgWeAigIjYLGkV8CSwD7giIvanbV0O3AIcDtybHgDLgdsk9QMvU1xVZmZmHdLSwhIRvwA+UCP+\nEjBnkDZLgaU14huAU2rEXwcuHHKyZmaWhb95b2ZmWbmwmJlZVi4sZmaWlQuLmZll5cJiZmZZubCY\nmVlWLixmZpaVC4uZmWXlwmJmZlm5sJiZWVYuLGZmlpULi5mZZeXCYmZmWbmwmJlZVi4sZmaWlQuL\nmZll5cJiZmZZubCYmVlWLixmZpaVC4uZmWXlwmJmZlm5sJiZWVZtKSySxkj6qaQfpPljJd0vaWt6\nPqa07hJJ/ZK2SDq3FO+VtCktu16SUnycpDtSfL2kKe14TWZmVlvdhUXS1Hpig7gSeKo0vxhYExHT\ngDVpHknTgfnAycBc4EZJY1Kbm4DLgGnpMTfFFwKvRMSJwHXANfW+JjMzy6+RHsv3asTuPFQjSScA\nnwS+VQrPA1ak6RXA+aX4yoh4IyK2Af3AGZImAkdFxLqICODWqjaVbd0JzKn0ZszMrP3GHmoFSe+n\n6EFMkPSZ0qKjgHfVsY//DvwpcGQp1hMRu9L080BPmp4ErCuttyPFfpOmq+OVNs8BRMQ+SXuA44AX\n68jNzMwyO2RhAU4CzgOOBj5Vir9KMTQ1KEnnAbsjYqOkWbXWiYiQFPWl2zxJi4BFAD09PfT19TW1\nnYGBgabbtpLzaozzaky35gXdm9uozisi6noAH6533VKbv6ToXWyn6Jn8Cvg2sAWYmNaZCGxJ00uA\nJaX29wEfTus8XYpfDPxteZ00PZaip6KD5dXb2xvNWrt2bdNtW8l5NcZ5NaZb84ro3txGYl7Ahqjj\ns7+Rcyz9kv5M0jJJN1cehyhaSyLihIiYQnFS/sGI+PfAamBBWm0BcHeaXg3MT1d6TaU4Sf9oFMNm\neyXNTOdPLq1qU9nWBWkfLe8BmZlZbfUMhVXcDfwIeADYP8T9Xg2skrQQeBa4CCAiNktaBTwJ7AOu\niIjKvi4HbgEOB+5ND4DlwG2S+oGXKQqYmZl1SCOF5Xci4qpmdxQRfUBfmn4JmDPIekuBpTXiG4BT\nasRfBy5sNi8zM8urkaGwH0j6RMsyMTOzEaGRwnIlRXH5taS9kl6VtLdViZmZ2fBU91BYRBx56LXM\nzGy0q7uwSDq7VjwiHsqXjpmZDXeNnLz/L6XpdwFnABuBP8iakZmZDWuNDIWVv3WPpMkUt2sxMzN7\n01Bum78D+P1ciZiZ2cjQyDmW/wFUvtH+DuA04LFWJGVmZsNXI+dYNpSm9wG3R8T/zZyPmZkNc42c\nY1kh6Z3A+1JoS2tSMjOz4ayRobBZFD+otR0QMFnSAl9ubGZmZY0MhV0LfCwitgBIeh9wO9DbisTM\nzGx4auSqsMMqRQUgIv4JOCx/SmZmNpw1dPJe0rcofqgL4BIOPKFvZmbWUGH5T8AVwH9O8z8Cbsye\nkZmZDWuNFJaxwDcj4hsAksYA41qSlZmZDVuNnGNZQ/HrjRWHU/yapJmZ2ZsaKSzvioiBykya/p38\nKZmZ2XDWSGF5TdLplRlJvcCv86dkZmbDWSPnWL4EfFfSP1N8QfJfAX/UkqzMzGzYqrvHEhE/Ad5P\ncXXYnwC/HxEbK8slfTR/et1pyuJ7Op2CmVnXaui2+RHxm4h4Ij1+U7X4mox5mZnZMDWU32OpprcF\npH
dJelTSzyRtlvQXKX6spPslbU3Px5TaLJHUL2mLpHNL8V5Jm9Ky6yUpxcdJuiPF10uakvE1mZlZ\ng3IWlqgRewP4g4j4AMXvt8yVNBNYDKyJiGkUlzEvBpA0HZgPnAzMBW5M35cBuAm4DJiWHnNTfCHw\nSkScCFyHe05mZh2Vs7C8TRQqlygflh4BzKO4UzLp+fw0PQ9YGRFvRMQ2oB84Q9JE4KiIWBcRAdxa\n1aayrTuBOZXejJmZtV/OwrK9VlDSGEmPA7uB+yNiPdATEbvSKs8DPWl6EvBcqfmOFJuUpqvjB7SJ\niH3AHuC4ob4YMzNrziEvN5b0mYMtj4jvp+ea60XEfuA0SUcDd0k6pWp5SKo1jJaVpEXAIoCenh76\n+vqa2s7AwABfOXV/0+1bZWBgoOtyAufVKOfVuG7NbTTnVc/3WD6Vnt8DfAR4MM3PBn4MfL+eHUXE\nLyWtpTg38oKkiRGxKw1z7U6r7QQml5qdkGI703R1vNxmh6SxwATgpRr7XwYsA5gxY0bMmjWrnrTf\npq+vj2sffo3tlzTXvlX6+vpo9jW1kvNqjPNqXLfmNprzOuRQWER8PiI+T3F+ZHpEfDYiPktxgv2g\nv8ci6d2pp4Kkw4GPAk8Dq4EFabUFwN1pejUwP13pNZXiJP2jadhsr6SZ6fzJpVVtKtu6AHgwnYcx\nM7MOaOSb95NL50UAXgB+9xBtJgIr0pVd7wBWRcQPJD0CrJK0EHgWuAggIjZLWgU8CewDrkhDaQCX\nA7dQ3Pzy3vQAWA7cJqkfeJniqjIzM+uQRgrLGkn3UfwcMRS3czno3Y0j4ufAB2vEXwLmDNJmKbC0\nRnwDcEqN+OvAhYdK3szM2qPuwhIRX0gn8s9KoWURcVdr0jIzs+GqkR5L5Qqwuk7Wm5nZ6FTP5cYP\nR8SZkl7lwG/Xi+Jq4aNalp2ZmQ07hywsEXFmej6y9emYmdlw19JbupiZ2ejjwmJmZlm5sDTJP/Zl\nZlabC4uZmWXlwmJmZlm5sJiZWVYuLGZmlpULi5mZZeXCYmZmWbmwmJlZVi4sZmaWlQuLmZll5cJi\nZmZZubCYmVlWLixD4PuFmZm9nQuLmZll5cJiZmZZubCYmVlWLixmZpZVSwuLpMmS1kp6UtJmSVem\n+LGS7pe0NT0fU2qzRFK/pC2Szi3FeyVtSsuul6QUHyfpjhRfL2lKK1+TmZkdXKt7LPuAr0TEdGAm\ncIWk6cBiYE1ETAPWpHnSsvnAycBc4EZJY9K2bgIuA6alx9wUXwi8EhEnAtcB17T4NZmZ2UG0tLBE\nxK6IeCxNvwo8BUwC5gEr0morgPPT9DxgZUS8ERHbgH7gDEkTgaMiYl1EBHBrVZvKtu4E5lR6M2Zm\n1n4qPqfbsKNiiOoh4BTg/0XE0Skuih7H0ZJuANZFxLfTsuXAvcB24OqIOCfFzwKuiojzJD0BzI2I\nHWnZM8CHIuLFqv0vAhYB9PT09K5cubKp1zEwMMC2PfvfnD910oSmtpPbwMAA48eP73Qab+O8GuO8\nGtetuY3EvGbPnr0xImYcar2xTW29QZLGA98DvhQRe8sdiogISS2vbhGxDFgGMGPGjJg1a1ZT2+nr\n6+Pah197c377Jc1tJ7e+vj6afU2t5Lwa47wa1625jea8Wn5VmKTDKIrKdyLi+yn8QhreIj3vTvGd\nwORS8xNSbGearo4f0EbSWGAC8FL+V2JmZvVo9VVhApYDT0XEN0qLVgML0vQC4O5SfH660msqxUn6\nRyNiF7BX0sy0zUur2lS2dQHwYLRrfA/f1sXMrFqrh8L+HfDHwCZJj6fYnwFXA6skLQSeBS4CiIjN\nklYBT1JcUXZFRFROaFwO3AIcTnHe5d4UXw7cJqkfeJniqjIzM+uQlhaWiHgYGOwKrTmDtFkKLK0R\n30Bx4r86/jpw4RDSNDOzjPzNezMzy8qFxczMsnJhMTOzrFxYzMws
KxcWMzPLyoXFzMyycmExM7Os\nXFgy8TfwzcwKLixmZpaVC4uZmWXlwmJmZlm5sJiZWVYuLGZmlpULi5mZZeXCYmZmWbmwmJlZVi4s\nmfmLkmY22rmwmJlZVi4sLeBei5mNZi4sZmaWlQtLi7jXYmajlQuLmZll1dLCIulmSbslPVGKHSvp\nfklb0/MxpWVLJPVL2iLp3FK8V9KmtOx6SUrxcZLuSPH1kqa08vWYmdmhtbrHcgswtyq2GFgTEdOA\nNWkeSdOB+cDJqc2NksakNjcBlwHT0qOyzYXAKxFxInAdcE3LXkkTPBxmZqNRSwtLRDwEvFwVnges\nSNMrgPNL8ZUR8UZEbAP6gTMkTQSOioh1ERHArVVtKtu6E5hT6c10CxcXMxttOnGOpScidqXp54Ge\nND0JeK603o4Um5Smq+MHtImIfcAe4LjWpG1mZvVQ0Qlo4Q6K8x4/iIhT0vwvI+Lo0vJXIuIYSTcA\n6yLi2ym+HLgX2A5cHRHnpPhZwFURcV46dzM3InakZc8AH4qIF2vksQhYBNDT09O7cuXKpl7PwMAA\n2/bsPyB26qQJbNq5583nWk6dNKGp/TWS1/jx41u6j2Y4r8Y4r8Z1a24jMa/Zs2dvjIgZh1pvbFNb\nH5oXJE2MiF1pmGt3iu8EJpfWOyHFdqbp6ni5zQ5JY4EJwEu1dhoRy4BlADNmzIhZs2Y1lXxfXx/X\nPvzaAbHtl8zic4vvefO5lu2XNLe/RvJq9jW1kvNqjPNqXLfmNprz6sRQ2GpgQZpeANxdis9PV3pN\npThJ/2gaNtsraWY6f3JpVZvKti4AHoxWd8HMzOygWn258e3AI8BJknZIWghcDXxU0lbgnDRPRGwG\nVgFPAj8EroiIypjT5cC3KE7oP0MxRAawHDhOUj/wZdIVZt3KJ/LNbDRo6VBYRFw8yKI5g6y/FFha\nI74BOKVG/HXgwqHk2G5TFt/D9qs/2ek0zMxaxt+87wD3XMxsJHNh6RAXFzMbqVxYOsjFxcxGIheW\nDnNxMbORxoXFzMyy6sQXJK2Gcs/FV42Z2XDmHksX8vCYmQ1nLixdasrie1xgzGxYcmEZBlxgzGw4\n8TmWYcTnYcxsOHCPZZiqFBn3Zsys27jHMkJs2rnngFv2u0djZp3iHssIVT757wsBzKyd3GMZRaqL\ny/arP+m7LZtZdi4sVrM3Uyk6teJmZgfjoTBrSPVFA5VhNg+3mVmFeyyWXWV4bcrie/jKqfsOuKgA\nBu8NVZaZ2fDmwmJd52BFp1y06l1WXsfMWs+FxUaVStGp7kkdqiDlLHYucDbSubCYtVl1kSnPlwte\n7oJWa516l1Xyqne/Nrq5sJhZdu0odhW3zD2irUW2nm1vv/qTbNq5h1m091gcKqd28VVhZmajRLuu\n3BwRhUXSXElbJPVLWtzpfMzMRrNhX1gkjQH+J/BxYDpwsaTpnc3KzGz0GvaFBTgD6I+IX0TEvwAr\ngXkdzsnMbNQaCYVlEvBcaX5HipmZWQcoIjqdw5BIugCYGxH/Mc3/MfChiPhC1XqLgEVp9iRgS5O7\nPB54scm2reS8GuO8GtOteUH35jYS8/q9iHj3oVYaCZcb7wQml+ZPSLEDRMQyYNlQdyZpQ0TMGOp2\ncnNejXFejenWvKB7cxvNeY2EobCfANMkTZX0TmA+sLrDOZmZjVrDvscSEfskfQG4DxgD3BwRmzuc\nlpnZqDXsCwtARPwj8I9t2t2Qh9NaxHk1xnk1plvzgu7NbdTmNexP3puZWXcZCedYzMysi7iwNKBT\nt46RNFnSWklPStos6coU/7qknZIeT49PlNosSXlukXRuC3PbLmlT2v+GFDtW0v2StqbnYzqQ10ml\n4/K4pL2SvtSJYybpZkm7JT1RijV8jCT1pmPdL+l6SWpBXn8t6WlJP5d0l6SjU3yKpF+XjtvftDmv\nhv9ubcrrjlJO2yU9nuLtPF6D
fT507j0WEX7U8aC4MOAZ4L3AO4GfAdPbtO+JwOlp+kjgnyhuX/N1\n4Ks11p+e8hsHTE15j2lRbtuB46tifwUsTtOLgWvanVeNv93zwO914pgBZwOnA08M5RgBjwIzAQH3\nAh9vQV4fA8am6WtKeU0pr1e1nXbk1fDfrR15VS2/FvjzDhyvwT4fOvYec4+lfh27dUxE7IqIx9L0\nq8BTHPzuAvOAlRHxRkRsA/op8m+XecCKNL0COL/Dec0BnomIZw+yTstyi4iHgJdr7K/uYyRpInBU\nRKyL4hPg1lKbbHlFxP+OiH1pdh3F98IG1a68DqKjx6si/c/+IuD2g22jRXkN9vnQsfeYC0v9uuLW\nMZKmAB8E1qfQF9Owxc2lrm47cw3gAUkbVdzdAKAnInal6eeBng7kVTafA//Bd/qYQePHaFKabld+\nAP+B4n+tFVPTsM7/kXRWirUzr0b+bu0+XmcBL0TE1lKs7cer6vOhY+8xF5ZhRNJ44HvAlyJiL3AT\nxdDcacAuiq54u50ZEadR3F36Cklnlxem//l07NJDFV+a/UPguynUDcfsAJ0+RrVI+hqwD/hOCu0C\nfjf9rb8M/L2ko9qYUtf93apczIH/eWn78arx+fCmdr/HXFjqV9etY1pF0mEUb5rvRMT3ASLihYjY\nHxG/Bf6Ot4Zu2pZrROxMz7uBu1IOL6RudaXrv7vdeZV8HHgsIl5IeXb8mCWNHqOdHDgs1bL8JH0O\nOA+4JH0gkYZNXkrTGynG5d/Xrrya+Lu183iNBT4D3FHKt63Hq9bnAx18j7mw1K9jt45J47fLgaci\n4hul+MTSap8GKlerrAbmSxonaSowjeKkXO68jpB0ZGWa4sTvE2n/C9JqC4C725lXlQP+J9npY1bS\n0DFKQxp7Jc1M74dLS22ykTQX+FPgDyPiV6X4u1X89hGS3pvy+kUb82ro79auvJJzgKcj4s1hpHYe\nr8E+H+jke2woVyOMtgfwCYorLp4BvtbG/Z5J0Y39OfB4enwCuA3YlOKrgYmlNl9LeW5hiFedHCSv\n91JcXfIzYHPlmADHAWuArcADwLHtzKu0ryOAl4AJpVjbjxlFYdsF/IZi3HphM8cImEHxgfoMcAPp\nC86Z8+qnGH+vvM/+Jq372fQ3fhx4DPhUm/Nq+O/WjrxS/BbgT6rWbefxGuzzoWPvMX/z3szMsvJQ\nmJmZZeXCYmZmWbmwmJlZVi4sZmaWlQuLmZll5cJiZmZZubCYDYGkHw8Sv0XSBe3Ox6wbuLCYDUFE\nfKTTOZh1GxcWsyGQNJCeJemG9MNJDwDvOUS77ZL+QtJj6YeV3p/iZ0h6RNJPJf1Y0kkp/jlJ/6Di\nB5u2S/qCpC+n9dZJOjat968l/TDdbfpHle2atZMLi1kenwZOovgRpUuBenoyL0bE6RR37v1qij0N\nnBURHwT+HPhvpfVPobjZ4b8FlgK/Sus9kvYJsAz4YkT0pm3eOJQXZdaMsZ1OwGyEOBu4PSL2A/8s\n6cE62lTuQruRomAATABWSJpGcf+nw0rrr43ih5xelbQH+F8pvgn4N+m26R8Bvqu3flF2XLMvyKxZ\nLixmnfNGet7PW/8W/ytFAfl0+tGmvhrrA/y2NP/b1P4dwC+j+A0Qs47xUJhZHg8BfyRpTLrF++wm\ntzOBt34D43ONNIzix522SboQ3jzv84Em8zBrmguLWR53Udye/EmK3wp/pMnt/BXwl5J+SnMjCpcA\nCyVVfspgXpN5mDXNt803M7Os3GMxM7OsfPLerIUk3QVMrQpfFRH3dSIfs3bwUJiZmWXloTAzM8vK\nhcXMzLJyYTEzs6xcWMzMLCsXFjMzy+r/A6XmAaHObuODAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f71fc391210>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "x_pot = np.arange(len(ids)) + 1\n",
    "plt.bar(x_pot, id_count)\n",
    "plt.xlabel('id_name')\n",
    "plt.ylabel('id_count')\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从上面的分布来看，这 1999 类数据的分布还算比较均衡，样本比较多的类别应该是 topic 层次比较高的话题。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试数据分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试问题数量 \u001b[1;35m 217360 \u001b[0m \n"
     ]
    }
   ],
   "source": [
    "df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\\t', names=['question_id', 'ch_title', 'word_title', 'ch_content', 'word_content'],\n",
    "                     dtype={'question_id': object})\n",
    "print '测试问题数量 \\033[1;35m %d \\033[0m ' % len(df_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_id</th>\n",
       "      <th>ch_title</th>\n",
       "      <th>word_title</th>\n",
       "      <th>ch_content</th>\n",
       "      <th>word_content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6215603645409872328</td>\n",
       "      <td>c924,c531,c102,c284,c188,c104,c98,c107,c11,c11...</td>\n",
       "      <td>w1340,w1341,w55,w1344,w58,w6,w24178,w26959,w47...</td>\n",
       "      <td>c1128,c529,c636,c572,c1321,c139,c540,c223,c510...</td>\n",
       "      <td>w4094,w1618,w20104,w19234,w1097,w1005,w4228,w2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6649324930261961840</td>\n",
       "      <td>c346,c1549,c413,c294,c675,c504,c183,c74,c541,c...</td>\n",
       "      <td>w40132,w1357,w1556,w1380,w2464,w33,w16791,w109...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-4251899610700378615</td>\n",
       "      <td>c96,c97,c97,c98,c99,c100,c101,c141,c42,c42,c10...</td>\n",
       "      <td>w53,w54,w1779,w54,w1309,w54,w369,w949,w65587,w...</td>\n",
       "      <td>c149,c148,c148,c42,c185,c95,c95,c186,c186,c186...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            question_id                                           ch_title  \\\n",
       "0   6215603645409872328  c924,c531,c102,c284,c188,c104,c98,c107,c11,c11...   \n",
       "1   6649324930261961840  c346,c1549,c413,c294,c675,c504,c183,c74,c541,c...   \n",
       "2  -4251899610700378615  c96,c97,c97,c98,c99,c100,c101,c141,c42,c42,c10...   \n",
       "\n",
       "                                          word_title  \\\n",
       "0  w1340,w1341,w55,w1344,w58,w6,w24178,w26959,w47...   \n",
       "1  w40132,w1357,w1556,w1380,w2464,w33,w16791,w109...   \n",
       "2  w53,w54,w1779,w54,w1309,w54,w369,w949,w65587,w...   \n",
       "\n",
       "                                          ch_content  \\\n",
       "0  c1128,c529,c636,c572,c1321,c139,c540,c223,c510...   \n",
       "1                                                NaN   \n",
       "2  c149,c148,c148,c42,c185,c95,c95,c186,c186,c186...   \n",
       "\n",
       "                                        word_content  \n",
       "0  w4094,w1618,w20104,w19234,w1097,w1005,w4228,w2...  \n",
       "1                                                NaN  \n",
       "2                                                NaN  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_eval.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "没有内容的问题 \u001b[1;35m 55179 \u001b[0m  \n",
      "一共有 0.25386 没有问题描述\n"
     ]
    }
   ],
   "source": [
    "na_count = 0\n",
    "ch_contents = df_eval.ch_content.values\n",
    "for ch_content in ch_contents:\n",
    "    if type(ch_content) is float:\n",
    "        na_count += 1\n",
    "print '没有内容的问题 \\033[1;35m %d \\033[0m  ' % na_count\n",
    "print '一共有 %g 没有问题描述' % (na_count / len(df_eval))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在测试集中，一共有 25.38% 没有问题描述。**有 3 个问题的 title 分词丢失了。暂时使用 ch 来替代.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "110081\n",
      "166420\n",
      "194409\n",
      "没有题目的问题数量 \u001b[1;35m 3 \u001b[0m  \n"
     ]
    }
   ],
   "source": [
    "word_titles = df_eval.word_title.values\n",
    "\n",
    "na_count = 0\n",
    "na_indexs = list()\n",
    "for i in xrange(len(word_titles)):\n",
    "    word_title = word_titles[i]\n",
    "    if type(word_title) is float:\n",
    "        na_indexs.append(i)\n",
    "        na_count += 1\n",
    "        print i\n",
    "        \n",
    "print '没有题目的问题数量 \\033[1;35m %d \\033[0m  ' % na_count\n",
    "\n",
    "for na_index in na_indexs:\n",
    "    df_eval.loc[na_index, 'word_title'] = df_eval.loc[na_index, 'ch_title']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "question_id                                  -4720539803178200791\n",
       "ch_title        c149,c148,c148,c42,c185,c95,c95,c42,c147,c104,...\n",
       "word_title      c149,c148,c148,c42,c185,c95,c95,c42,c147,c104,...\n",
       "ch_content      c41,c147,c101,c149,c144,c100,c279,c277,c278,c1...\n",
       "word_content                     w3856,w6,w11610,w469,w625,w31521\n",
       "Name: 194409, dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_eval.iloc[194409]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 话题 title\n",
    "df_eval['word_title'] = df_eval.word_title.apply(lambda text: text.split(','))\n",
    "df_eval['ch_title'] = df_eval.ch_title.apply(lambda text: text.split(','))\n",
    "df_eval['wdtitle_len'] = df_eval.word_title.apply(lambda ws: len(ws))\n",
    "df_eval['chtitle_len'] = df_eval.ch_title.apply(lambda chs: len(chs))\n",
    "# 将 描述非空的部分提取出来分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "非空描述数量 157126 \n"
     ]
    }
   ],
   "source": [
    "index_content = df_eval.word_content.apply(lambda wc: type(wc) is not float)\n",
    "print '非空描述数量 %d ' % sum(index_content)\n",
    "df_eval_content = df_eval.loc[index_content, :].copy()  # 把带有描述部分取出来\n",
    "# 话题 描述\n",
    "df_eval_content['word_content'] = df_eval_content.word_content.apply(lambda text: text.split(','))\n",
    "df_eval_content['ch_content'] = df_eval_content.ch_content.apply(lambda text: text.split(','))\n",
    "df_eval_content['wdcontent_len'] = df_eval_content.word_content.apply(lambda ws: len(ws))\n",
    "df_eval_content['chcontent_len'] = df_eval_content.ch_content.apply(lambda chs: len(chs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_id</th>\n",
       "      <th>ch_title</th>\n",
       "      <th>word_title</th>\n",
       "      <th>ch_content</th>\n",
       "      <th>word_content</th>\n",
       "      <th>wdtitle_len</th>\n",
       "      <th>chtitle_len</th>\n",
       "      <th>wdcontent_len</th>\n",
       "      <th>chcontent_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6215603645409872328</td>\n",
       "      <td>[c924, c531, c102, c284, c188, c104, c98, c107...</td>\n",
       "      <td>[w1340, w1341, w55, w1344, w58, w6, w24178, w2...</td>\n",
       "      <td>[c1128, c529, c636, c572, c1321, c139, c540, c...</td>\n",
       "      <td>[w4094, w1618, w20104, w19234, w1097, w1005, w...</td>\n",
       "      <td>18</td>\n",
       "      <td>29</td>\n",
       "      <td>22</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6213817087034420233</td>\n",
       "      <td>[c504, c157, c221, c221, c633, c468, c469, c16...</td>\n",
       "      <td>[w5083, w12537, w10427, w29724, w6, w2566, w11...</td>\n",
       "      <td>[c15, c131, c39, c40, c85, c166, c969, c2456, ...</td>\n",
       "      <td>[w2550, w24, w239, w98, w19456, w11, w108710, ...</td>\n",
       "      <td>12</td>\n",
       "      <td>23</td>\n",
       "      <td>13</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           question_id                                           ch_title  \\\n",
       "0  6215603645409872328  [c924, c531, c102, c284, c188, c104, c98, c107...   \n",
       "3  6213817087034420233  [c504, c157, c221, c221, c633, c468, c469, c16...   \n",
       "\n",
       "                                          word_title  \\\n",
       "0  [w1340, w1341, w55, w1344, w58, w6, w24178, w2...   \n",
       "3  [w5083, w12537, w10427, w29724, w6, w2566, w11...   \n",
       "\n",
       "                                          ch_content  \\\n",
       "0  [c1128, c529, c636, c572, c1321, c139, c540, c...   \n",
       "3  [c15, c131, c39, c40, c85, c166, c969, c2456, ...   \n",
       "\n",
       "                                        word_content  wdtitle_len  \\\n",
       "0  [w4094, w1618, w20104, w19234, w1097, w1005, w...           18   \n",
       "3  [w2550, w24, w239, w98, w19456, w11, w108710, ...           12   \n",
       "\n",
       "   chtitle_len  wdcontent_len  chcontent_len  \n",
       "0           29             22             37  \n",
       "3           23             13             21  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_eval_content.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 测试集话题题目长度分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "** title 长度（词数）\n",
      "max = 76， min = 1, mean = 12, median = 11\n"
     ]
    }
   ],
   "source": [
    "wdtitle_len = df_eval.wdtitle_len.values\n",
    "max_len = max(wdtitle_len)\n",
    "min_len = min(wdtitle_len)\n",
    "mean_len = np.mean(wdtitle_len)\n",
    "median_len = np.median(wdtitle_len)\n",
    "print '** title 长度（词数）'\n",
    "print 'max = %d， min = %d, mean = %d, median = %d' % (max_len, min_len, mean_len, median_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAELCAYAAAA7h+qnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFzBJREFUeJzt3X20XXV95/H314DKyk0DGHonTRkTW2UWmorkolbUda+O\nTkZU1GnpUGt5UKNT63QN6IgPS3E5rsEi2rVoFwhqgzVyizw0GKgOsrhQapXe0OgFkeJDWEMMiTyY\n5gILJvCdP86OHMI9D/ucs8++N3m/1jrrnP3wO/tzd3bu9+6n347MRJJ0YHtG3QEkSfWzGEiSLAaS\nJIuBJAmLgSQJi4EkCYuBJAmLgSQJi4EkCTio7gDdWrZsWa5cubKntg899BCLFy8ebKABMFc55irH\nXOXsr7k2b958X2Ye0XHGzFwQrzVr1mSvbrjhhp7bVslc5ZirHHOVs7/mAqazi9+xHiaSJFkMJEkW\nA0kSFgNJEhYDSRIVF4OIeHZE3BIR34+I2yPik8X4wyPiuoi4q3g/rMockqT2qt4zeBR4TWa+GDgG\nWBsRLwfOAq7PzOcD1xfDkqSaVFoMistcZ4vBg4tXAicClxTjLwHeUmUOSVJ7lZ8ziIhFEbEF2Alc\nl5nfA0Yzc3sxy73AaNU5JEmtReMGtSEsKOJQ4Crg/cDNmXlo07QHM/Np5w0iYh2wDmB0dHTN5ORk\nT8uenZ1lZGSkp7ZVqjLXzLZdPbddtXTRAbe++mGucsxVTr+5JiYmNmfmWKf5hlYMACLi48DDwLuB\n8czcHhHLganMPKpd27GxsZyenu5puVNTU4yPj/fUtkpV5lp51jU9t12/dvEBt776Ya5yzFVOv7ki\noqtiUPXVREcUewRExCHA64AfAVcDpxSznQJsrDKHJKm9qnstXQ5cEhGLaBSeyzJzU0T8E3BZRLwT\nuBs4qeIckqQ2Ki0GmfkD4CVzjL8feG2Vy5Ykdc87kCVJFgNJ0gJ60lk/Zrbt4tQer67Zes4JA04j\nSfOPewaSJIuBJMliIEnCYiBJwmIgScJiIEnCYiBJwmIgScJiIEnCYiBJwmIgScJiIEnCYiBJ4gDp\ntbQf/TxLGOz1VNLC4J6BJMliIEmyGEiSsBhIkrAYSJKwGEiSsBhIkrAYSJKwGEiSqLgYRMSREXFD\nRPwwIm6PiD8rxp8dEdsiYkvxekOVOSRJ7VXdHcUe4MzMvDUilgCbI+K6YtrnM/OzFS9fktSFSotB\nZm4Hthefd0fEHcCKKpcpSSovMnM4C4pYCdwEvAg4AzgN2AVM09h7eHCONuuAdQCjo6NrJicne1r2\nzgd2seORnpr2bfWKpS2nzc7OMjIyUslyZ7bt6rntqqWLKsvVjyrXVz/MVY65yuk318TExObMHOs0\n31CKQUSMADcCn87MKyNiFLgPSOBTwPLMPL3dd4yNjeX09HRPyz9/w0bOm6mng9Z2vZZOTU0xPj5e\nyXL76W11/drFleXqR5Xrqx/mKsdc5fSbKyK6KgaVX00UEQcDVwAbMvNKgMzckZmPZ+YTwMXAS6vO\nIUlqreqriQL4EnBHZn6uafzyptneCtxWZQ5JUntVHzs5HngHMBMRW4pxHwFOjohjaBwm2gq8p+Ic\nkqQ2qr6a6GYg5ph0bZXLlSSV4x3IkiSLgSTJYiBJwmIgScJiIEnCYiBJwmIgScJiIEnCYiBJwmIg\nScJiIEnCYiBJwmIgScJiIEnCYiBJwmIgSaLLYhANR1YdRpJUj66KQWYmPp1MkvZbZQ4T3RoRx1WW\nRJJUmzLPQH4Z8PaIuBt4iMazjTMzf6eSZJKkoSlTDP5TZSkkSbXq+jBRZt4NHAm8pvj8cJn2kqT5\nq+tf5hHxCeBDwIeLUQcDX60ilCRpuMr8Zf9W4M00zheQmT8HllQRSpI0XGWKwWPFJaYJEBGLq4kk\nSRq2MsXgsoj4AnBoRLwb+DZwcbsGEX
FkRNwQET+MiNsj4s+K8YdHxHURcVfxfljvP4IkqV9lTiB/\nFrgcuAJ4AfDxzDy/Q7M9wJmZeTTwcuB9EXE0cBZwfWY+H7i+GJYk1aTMpaUAM8AhNA4VzXSaOTO3\nA9uLz7sj4g5gBXAiMF7MdgkwRePktCSpBmWuJnoXcAvwNuD3gO9GxOkl2q8EXgJ8DxgtCgXAvcBo\nt98jSRq8aJwT7mLGiDuBV2Tm/cXwc4DvZOZRXbQdAW4EPp2ZV0bELzPz0KbpD2bm084bRMQ6YB3A\n6OjomsnJya6y7mvnA7vY8UhPTfu2esXSltNmZ2cZGRmpZLkz23b13HbV0kWV5epHleurH+Yqx1zl\n9JtrYmJic2aOdZqvzGGi+4HdTcO7i3FtRcTBNM4zbMjMK4vROyJieWZuj4jlwM652mbmRcBFAGNj\nYzk+Pl4i7pPO37CR82bKHhEbjK1vH285bWpqil5/pk5OPeuantuuX7u4slz9qHJ99cNc5ZirnGHl\n6vgbMiLOKD7+GPheRGykcc7gROAHHdoG8CXgjsz8XNOkq4FTgHOK943lo0uSBqWbP5f33lj2k+K1\nVze/wI8H3gHMRMSWYtxHaBSByyLincDdwEndxZUkVaFjMcjMT/b65Zl5M43eTefy2l6/V5I0WF0f\nSI+IMeCjwHOb29mFtSQtfGXOqm4APkjj/oInqokjSapDmWLwi8y8urIk2i+s7OMqJoCt55wwoCSS\nyihTDD4REV+k0X3Eo3tHNl0uKklaoMoUg9OA/0DjOQZ7DxMlYDGQpAWuTDE4rpu7jSVJC0+ZLqy/\nU/Q4Kknaz5TZM3g5sCUifkbjnEEA6aWlkrTwlSkGaytLIUmqVZli0F33pnqKdpdanrl6T9sO5bzM\nUtKwlCkG19AoCAE8G1gF3Am8sIJckqQh6roYZObq5uGIOBb4k4EnkiQNXZmriZ4iM28FXjbALJKk\nmpTpqO6MpsFnAMcCPx94IknS0JU5Z7Ck6fMeGucQrhhsHElSHcqcM+j5uQaSpPmtzGGiFwAfAFby\n1OcZvGbwsVSnmW27+nqGsqSFp8xhoq8DFwJfBB6vJo4kqQ5lisGezLygsiSSpNqUubT0GxHxJxGx\nPCIO3/uqLJkkaWjK7BmcUrx/sGlcAs8bXBxJUh3KXE20qt30iHhdZl7XfyRJ0rCV2TPo5DOAxWCA\n+n2esCR1q+fuKOYQA/wuSdIQDbIY2MW1JC1QgywGkqQFapDFYOu+IyLiyxGxMyJuaxp3dkRsi4gt\nxesNA8wgSepBqRPIEfEKnt4dxVeK97fN0WQ98JfAV/YZ//nM/GyZZUuSqlOmb6K/AX4L2MKT3VEk\nT/9F/yuZeVNErOwjnyRpCCKzu/O+EXEHcHR22+DJdiuBTZn5omL4bOA0YBcwDZyZmQ+2aLsOWAcw\nOjq6ZnJyssyif2XnA7vY8UhPTSs1egjm2sfqFUtbTpudnWVkZGSIabpjrnLMVU6/uSYmJjZn5lin\n+coUg68D/z0zt5cJMkcxGAXuo7FX8SlgeWae3ul7xsbGcnp6usyif+X8DRs5b2aQt1QMxpmr95hr\nH1vPOaHltKmpKcbHx4cXpkvmKsdc5fSbKyK6KgZl/scvA34YEbcAj+4dmZlvLhMsM3c0hbwY2FSm\nvSRp8MoUg7MHscCIWN60d/FW4LZ280uSqlemb6Ibi0M8xxWjbsnMne3aRMSlwDiwLCLuAT4BjEfE\nMTQOE20F3tNDbknSAJW5mugk4FxgikbXE+dHxAcz8/JWbTLz5DlGf6lsSElStcocJvoocNzevYGI\nOAL4NtCyGEiSFoYyxeAZ+xwWuh+7s9A80k8vr+2uYpIOBGWKwTcj4lvApcXwHwDXDj6SJGnYypxA\n/mBE/Bfg+GLURZl5VTWxJEnDVOrOosy8AriioiySpJp0LAYRcXNmvjIidvPUZxYEkJn5a5WlkyQN\nRc
dikJmvLN6XVB9HklSHrq8GKnot7ThOkrTwlDln8MLmgYg4CFgz2Dg60LW7PPTM1Xs4tY/LR3td\nbifr1y4eYBKpHh33DCLiw8X5gt+JiH8rXruBHcDGyhNKkirXsRhk5v8uzhecm5m/VryWZOZzMvPD\nQ8goSapYmTuIN0XEYoCI+KOI+FxEPLeiXJKkISpTDC4AHo6IFwNnAj+hzSMvJUkLR5lisKd45OWJ\nwF9m5l8BXm4qSfuBMlcT7Y6IDwPvAF4VEc8ADq4mliRpmMoUgz8A/hA4PTPvjYh/T+P5BpKkNhbC\npctdHybKzHtp9Ev0rGLUfYAd1UnSfqDMHcjvpvEgmy8Uo1YAf1dFKEnScJU5gfw+Gt1X/xtAZt4F\n/HoVoSRJw1WmGDyamY/tHSi6o8g280uSFogyxeDGiPgIcEhEvA74OvCNamJJkoapTDE4C/gFMAO8\nh8YjLz9WRShJ0nCVeezlE8DFxUvSAtfP5Y5bzzlhgEk0H3RdDCLiZ8xxjiAznzfQRJKkoStz09lY\n0+dnA78PHD7YOJKkOpS56ez+pte2zPwLoO2+YkR8OSJ2RsRtTeMOj4jrIuKu4v2wPvJLkgagzE1n\nxza9xiLivXTes1gPrN1n3FnA9Zn5fOD6YliSVKMyh4nO48lzBnuArTQOFbWUmTdFxMp9Rp8IjBef\nLwGmgA+VyCFJGrAyxWATjWIQxXACb4xoDGbm57r8ntHM3F58vhcYLZFBklSBaDyioIsZI74GHEfj\nuccBvAm4BbgLIDM/2aLdSmBTZr6oGP5lZh7aNP3BzJzzvEFErAPWAYyOjq6ZnJzsKuu+dj6wix2P\n9NS0UqOHYK4S5muuVUsXMTIyUsuyZ7btajmtyvW1esXSntvOzs7Wtr7aqTJXu3+nTvrdviYmJjZn\n5lin+coUg5uAEzJzdzG8BLgmM1/dod1KnloM7gTGM3N7RCwHpjLzqE7LHxsby+np6a6y7uv8DRs5\nb6bMTtBwnLl6j7lKmK+51q9dzPj4eC3LbnevQJXrq5/7DKampmpbX+1UmavfLqz7yRURXRWDMncg\njwKPNQ0/Rm+HeK4GTik+n0JjT0OSVKMyfzZ8BbglIvY+w+AtNK4WaikiLqVxsnhZRNwDfAI4B7gs\nIt4J3A2cVDKzJGnAynRH8emI+HvgVcWo0zLzXzq0ObnFpNd2u1xJUvVKHVDMzFuBWyvKIkmqSZlz\nBpKk/dT8uzRD0ry3EB7wPpdOV1+d2mb6/t5Tq3sGkiSLgSTJYiBJwmIgScJiIEnCYiBJwktLpVr1\nc4mmNEjuGUiSLAaSJIuBJAmLgSQJi4EkCYuBJAmLgSQJi4EkCYuBJAmLgSQJi4EkCYuBJAk7qpM0\nZDPbdrV91nAn+/uziOvinoEkyWIgSbIYSJKo8ZxBRGwFdgOPA3syc6yuLJJ0oKv7BPJEZt5XcwZJ\nOuB5mEiSRGRmPQuO+Bmwi8Zhoi9k5kVzzLMOWAcwOjq6ZnJysqdl7XxgFzse6SNsRUYPwVwlmKsc\nc5XTKdfqFUt7/u6Zbbt6brtq6SJGRkZ6bj8xMbG5m8PwdRaDFZm5LSJ+HbgOeH9m3tRq/rGxsZye\nnu5pWedv2Mh5M3UfEXu6M1fvMVcJ5irHXOV0ytXP/Q0r+7ivYv3axYyPj/fcPiK6Kga1HSbKzG3F\n+07gKuCldWWRpANdLcUgIhZHxJK9n4HXA7fVkUWSVN/VRKPAVRGxN8PXMvObNWWRpANeLcUgM38K\nvLiOZUuSns5LSyVJFgNJksVAkoTFQJKExUCShMVAkoTFQJKExUCShMVAkoTFQJKExUCShMVAkoTF\nQJKExUCShMVAkkR9D7eRpAWln+cYLwTuGUiSLAaSJIuBJAmLgSQJi4EkCYuBJAmLgSQJi4EkCYuB\nJIkai0FErI2IOyPixxFxVl05JEk1FYOIWAT8FfCfgaOBkyPi6Dqy
SJLq2zN4KfDjzPxpZj4GTAIn\n1pRFkg54dRWDFcD/bRq+pxgnSapBZObwFxrxe8DazHxXMfwO4GWZ+af7zLcOWFcMHgXc2eMilwH3\n9di2SuYqx1zlmKuc/TXXczPziE4z1dWF9TbgyKbh3yzGPUVmXgRc1O/CImI6M8f6/Z5BM1c55irH\nXOUc6LnqOkz0z8DzI2JVRDwT+K/A1TVlkaQDXi17Bpm5JyL+FPgWsAj4cmbeXkcWSVKNTzrLzGuB\na4e0uL4PNVXEXOWYqxxzlXNA56rlBLIkaX6xOwpJ0sIvBhGxKCL+JSI2FcOHR8R1EXFX8X5Yi3aV\ndocxR65zI+JHEfGDiLgqIg5t0W5rRMxExJaImB5CrrMjYluxvC0R8YYW7Ya9vv62KdPWiNjSol3V\n6+tp3z8ftrEWuWrfxlrkqn0ba5Gr9m0sIg6NiMuLf7c7IuJ3a9u+MnNBv4AzgK8Bm4rhPwfOKj6f\nBXxmjjaLgJ8AzwOeCXwfOLriXK8HDio+f2auXMW0rcCyIa6vs4EPdGgz9PW1z7TzgI/XtL6e9v3z\nYRtrkav2baxFrtq3sU4/c13bGHAJ8K7i8zOBQ+vavhb0nkFE/CZwAvDFptEn0ljBFO9vmaNppd1h\nzJUrM/9PZu4pBr9L496KoWqxvrox9PXVNC2Ak4BLB7W8Aah9G5vLfNjG+lBbFzV1bWMRsRR4NfAl\ngMx8LDN/SU3b14IuBsBfAP8TeKJp3Ghmbi8+3wuMztGu6u4w5srV7HTg71tMS+DbEbE5GndgD1Kr\nXO8vDi18ucUuaZ3r61XAjsy8q0XbKtdXq++fD9tYp5+7rm2s1XfXvY21+5nr2sZWAb8A/ro4RPrF\niFhMTdvXgi0GEfFGYGdmbm41Tzb2p4Z6uVSnXBHxUWAPsKHFV7wyM4+h0aPr+yLi1RXnuoDGruYx\nwHYau8tD08W/48m0/4utkvXV7ffXsY11ylXXNtbmu2vdxtrk2quubewg4Fjggsx8CfAQjcNCvzLM\n7WvBFgPgeODNEbGVxi7SayLiq8COiFgOULzvnKNtV91hDDgXEXEq8Ebg7cU/8tNk5rbifSdwFY3d\nwcpyZeaOzHw8M58ALm6xvLrW10HA24C/bdW4wvXV7vvr3sZa/tw1b2Nzfvc82Mbara86t7F7gHsy\n83vF8OU0ikM921cVJ0WG/QLGefKE6Lk89eTLn88x/0HAT2nspu09+fLCinOtBX4IHNFm/sXAkqbP\n36HRoV+VuZY3jf8fwOR8WF9N6+zGutZXq++vextrk6vWbaxNrlq3sXY/8zzYxv4BOKr4fHaxbdWy\nfQ3kB6r7tc8vt+cA1wN3Ad8GDi/G/wZwbVObNwD/SuOM/EeHkOvHNI7xbSleF+6bi8au9PeL1+1D\nyvU3wAzwAxr9Qy2fD+urGF4PvHefeYa2vlp9f93bWJtctW5jbXLVuo21+5nnwTZ2DDBdrJu/Aw6r\na/vyDmRJ0oI+ZyBJGhCLgSTJYiBJshhIkrAYSJKwGEiSsBjoABcR4/Fkt9njEfGKpmnvjYg/Lj6f\nGhG/0TRtKiJKP6S813ZS1Wp77KU0D40DszTuMiUzL2yadipwG/DzoaeShsBioP1a0QvkZTT6blkE\nfArYRaOn1IeBm4v5VgLvBR6PiD8C3g+8lkZx2AqMARsi4hHgd/dZxuuBTwLPonE36GmZOdtFtjnb\nFf00XQK8CTgY+P3M/FGPq0DqioeJtL9bC/w8M1+cmS8Cvkmjs7Q3AWuAfweQmVuBC4HPZ+YxmfkP\ne78gMy+n0WXA24tpj+ydFhHLgI8B/zEzjy3mO6NTqC7a3VeMvwD4QK8/vNQt9wy0v5sBzouIzwCb\ngN3Az7Lou77oIbWfPupfDhwN/GPjGSk8E/inAbS7snjfTKNXTalSFgPt1zLzXyPiWBqdev0vGh2A\nDVIA12XmyQNu92jx/jj+P9UQ
eJhI+7XiCqCHM/OrNLoGfgWwMiJ+q5il+ZfxbmBJi69qNe27wPER\n8dvF8hZHxAu6iNZrO6kS/sWh/d1q4NyIeAL4f8B/A5YB10TEwzT6k9/7S/4bwOURcSKNE8jN1gMX\n7nsCOTN/UTxQ5tKIeFYx+mM0uhZuqdd2UlXswlqS5GEiSZKHiaRKRMRVNB5J2OxDmfmtOvJInXiY\nSJLkYSJJksVAkoTFQJKExUCShMVAkgT8fxFhO1YwqinAAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fb064326090>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(wdtitle_len, bins = 20, range=(40, 60))\n",
    "plt.xlabel('sdtitle_len')\n",
    "plt.ylabel('question_number')\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "title取最长长度为50；\n",
    "\n",
    "title 的最后一个词最多的是 ？ 此外还要找到句号或者叹号便于进行句子切分。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 测试集话题描述长度分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "** content 长度（词数）\n",
      "max = 2409， min = 1, mean = 80, median = 40\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAAELCAYAAAD3HtBMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHuVJREFUeJzt3X+UHWWd5/H3hx+DWZqfwvTGwJi4E50TiAbSIqMw2xEc\nossaVMRwWAFlyDgw/jiiS6J7FHdOzsBqcMdB0SAcQNAmCiwZfowCQ6OMhkgw0CTIEE040hMSRUjS\n0YkmfvePehoqnf5xq3Lr3tt9P69z7umq59ZT9U31zf328zxVTykiMDMzK2qfZgdgZmbjkxOImZmV\n4gRiZmalOIGYmVkpTiBmZlaKE4iZmZXiBGJmZqU4gZiZWSlOIGZmVsp+zQ6gSkcccURMnTq1VN3t\n27dz4IEH1jegOnBcxTiuYhxXMRMxrlWrVv0qIo6saeOImLCv2bNnR1kPPPBA6bpVclzFOK5iHFcx\nEzEu4JGo8TvWXVhmZlaKE4iZmZXiBGJmZqU4gZiZWSlOIGZmVooTiJmZleIEYmZmpTiBmJlZKU4g\nZmZWihPIKKYuvIupC+9qdhhmZi3JCcTMzEpxAjEzs1KcQMzMrBQnEDMzK8UJxMzMSnECMTOzUpxA\nzMyslEoTiKRXSFop6TFJayR9LpUfLuleSU+nn4fl6iyStE7SU5JOy5XPltSX3vuSJFUZu5mZja7q\nFsgO4K0R8QZgFjBX0onAQuD+iJgO3J/WkTQDmA8cA8wFviJp37Svq4ELgenpNbfi2M3MbBSVJpD0\niN2BtLp/egUwD7ghld8AnJGW5wE9EbEjItYD64ATJE0GDo6IFemZvTfm6piZWRMo+z6u8ABZC2IV\n8KfAlyPiUkkvRsSh6X0BL0TEoZKuAlZExE3pvWuBe4ANwOURcWoqPxm4NCJOH+Z4C4AFAJ2dnbN7\nenpKxT0wMMD6LbsAmDnlkFL7qMLAwAAdHR3NDmMPjqsYx1WM4ypmb+KaM2fOqojoqmXb/UodoYCI\n2AXMknQocLukY4e8H5LqlsUiYimwFKCrqyu6u7tL7ae3t5clD20HYMM55fZRhd7eXsr+m6rkuIpx\nXMU4rmIaFVfDrsKKiBeBB8jGLjalbinSz81ps37g6Fy1o1JZf1oeWm5mZk1S9VVYR6aWB5ImAW8D\nfgosB85Lm50H3JGWlwPzJR0gaRrZYPnKiNgIbJV0YuryOjdXp3KekdfMbE9Vd2FNBm5I4yD7AMsi\n4k5JPwKWSboAeAY4CyAi1khaBqwFdgIXpy4wgIuA64FJZOMi91Qcu5mZjaLSBBIRjwPHDVP+PHDK\nCHUWA4uHKX8EOHbPGmZm1gy+E93MzEpxAjEzs1KcQMzMrBQnEDMzK8UJxMzMSnECMTOzUpxAzMys\nFCcQMzMrxQnEzMxKcQIxM7NSnEDMzKwUJxAzMyvFCcTMzEpxAjEzs1KcQMzMrBQnEDMzK8UJxMzM\nSnECMTOzUpxAzMysFCcQMzMrxQnEzMxKcQIxM7NSnEDMzKyUShOIpKMlPSBpraQ1kj6ayi+T1C9p\ndXq9I1dnkaR1kp6SdFqufLakvvTelySpytjNzGx0+1W8/53AJRHxqKSDgFWS7k3vfTEivpDfWNIM\nYD5wDPAq4D5Jr42IXcDVwIXAw8DdwFzgnorjNzOzEVTaAomIjRHxaFreBjwJTBmlyjygJyJ2RMR6\nYB1wgqTJwMERsSIiArgROKPK2M3MbHQNGwORNBU4jqwFAfBhSY9Luk7SYalsCvCLXLVnU9mUtDy0\nvGGmLryLqQvvauQhzcxamrI/6Cs+iNQBPAgsjojbJHUCvwIC+DtgckR8UNJVwIqIuCnVu5asm2oD\ncHlEnJrKTwYujYjThznWAmABQGdn5+yenp5SMQ8MDLB+y649ymdOOaTU/uplYGCAjo6OpsYwHMdV\njOMqxnEVszdxzZkzZ1VEdNWybdVjIE
jaH7gVuDkibgOIiE25968B7kyr/cDRuepHpbL+tDy0fA8R\nsRRYCtDV1RXd3d2l4u7t7WXJQ9v3KN9wTrn91Utvby9l/01VclzFOK5iHFcxjYqr6quwBFwLPBkR\nV+bKJ+c2exfwRFpeDsyXdICkacB0YGVEbAS2Sjox7fNc4I4qYzczs9FV3QJ5C/B+oE/S6lT2KeBs\nSbPIurA2AH8NEBFrJC0D1pJdwXVxugIL4CLgemASWbeWr8AyM2uiShNIRDwEDHe/xt2j1FkMLB6m\n/BHg2PpFZ2Zme8N3opuZWSlOIGZmVooTiJmZleIEYmZmpTiBmJlZKU4gFfLUJ2Y2kTmB7CXPkWVm\n7aqmBKLM0WNv2R6cNMzMakwgaQr1EW/+MzOz9lOkC+tRSW+sLBIzMxtXikxl8ibgHEnPANvJpiiJ\niHh9JZGZmVlLK5JATht7EzMzaxc1d2FFxDNkz+p4a1r+TZH6ZmY2sdScACR9FrgUWJSK9gduqiKo\n8ajIlVm+gsvMJoIiLYh3Ae8kG/8gIv4dOKiKoCYSX/JrZhNVkQTyu3Q5bwBIOrCakMY3JwszaxdF\nEsgySV8DDpV0IXAfcE01YZmZWaur+SqsiPiCpLcBW4HXAp+JiHsri2wccyvEzNpB0Ufa9pE9kzzS\nshXgxGJmE0mRq7D+ClgJvBs4E1gh6YNVBTbRDE0eHlw3s/GuSAvkk8BxEfE8gKRXAj8ErqsiMDMz\na21FBtGfB7bl1relsrZVjxaEWyFmNl6N2QKR9PG0uA54WNIdZGMg84DHK4zNzMxaWC1dWIM3C/4s\nvQbdUf9w2tNgK2TD5f+tyZGYmdVuzAQSEZ8ru/P0EKobgU6yVsvSiPgHSYcDtwBTgQ3AWRHxQqqz\nCLgA2AV8JCK+m8pnA9eTXQV2N/DRdGOjmZk1QZGrsLok3S7pUUmPD77GqLYTuCQiZgAnAhdLmgEs\nBO6PiOnA/Wmd9N584BhgLvAVSfumfV0NXAhMT6+5Nf8rzcys7opchXUz2ZVYfcAfaqkQERuBjWl5\nm6QngSlk4yfdabMbgF6yiRrnAT0RsQNYL2kdcIKkDcDBEbECQNKNwBnAPQXiNzOzOiqSQH4ZEcvL\nHkjSVOA44GGgMyUXgOfIurggSy4rctWeTWW/T8tDy83MrElU6zCCpFOAs8m6nHYMlkfEbTXU7QAe\nBBZHxG2SXoyIQ3PvvxARh0m6ClgRETel8mvJWhkbgMsj4tRUfjJwaUScPsyxFgALADo7O2f39PTU\n9O8bamBggPVbdu1RPnPKIfT1bym1z7Hk9z1zyiEjxtXR0VHJ8feG4yrGcRXjuIrZm7jmzJmzKiK6\natm2SAvkA8CfkT0HZLALK4BRE4ik/YFbgZtzyWaTpMkRsVHSZGBzKu8ne2jVoKNSWX9aHlq+h4hY\nCiwF6Orqiu7u7pr+cUP19vay5KHte5RvOKeb86u6d6NvOy/9Svq2D3tVVm9vL2X/TVVyXMU4rmIc\nVzGNiqtIAnljRLyuyM4lCbgWeDIirsy9tRw4D7g8/bwjV/5NSVcCryIbLF8ZEbskbZV0IlkX2LnA\nPxaJxczM6qtIAvmhpBkRsbZAnbcA7wf6JK1OZZ8iSxzLJF0APAOcBRARayQtA9aSXcF1cUQM9iNd\nxMuX8d5DGwyg5+8PGVy+fq4fw2JmraFIAjkRWC1pPdkYiICIiNePVCEiHkrbDeeUEeosBhYPU/4I\ncGyBeCcMT3diZq2oSALxfRdmZvaSIgnEd32bmdlLiiSQu8iSiIBXANOAp8juGrcG6evf8tIdmGZm\nzVTkkbYz8+uSjicb2DYzszZU5Hkgu4mIR4E31TEWMzMbR2pugeSeCwJZ4jke+Pe6R2RmZuNCkTGQ\ng3LLO8nGRG6tbzhmZjZeFBkDKf1cEDMzm3iKdGG9FvgE2UOgXqoXEW+tf1g2Gj/B0MxaQZEurG8D\nXw
W+Tva0QDMza2NFEsjOiLi6skjGCU8rYmaWKZJA/knSRcDt7P48kF/XPSqrST6ZuTvLzBqtSAI5\nL/38ZK4sgNfULxwzMxsvar6RMCKmDfN6KXlIels1IVpZ7m4zsyqVvhN9GFfUcV9WkJOFmTVaPRPI\nSM/9sCaauvAuJxczq0Q9E4inezczayNFBtGtxbmlYWaNVM8WyIY67svMzFpcoRaIpDez51QmN6af\n765rZGZm1tKKzIX1DeC/AKt5eSqTAG6sIC6rgOfQMrN6KtIC6QJmRIQHy83MrNAYyBPAf64qEDMz\nG1+KJJAjgLWSvitp+eBrtAqSrpO0WdITubLLJPVLWp1e78i9t0jSOklPSTotVz5bUl9670uSfM+J\nmVmTFenCuqzE/q8HrmLPcZIvRsQX8gWSZgDzgWOAVwH3SXptROwCrgYuBB4G7gbmAveUiMfMzOqk\nyFxYDwI/JXu07UHAk6lstDrfB2qdrXce0BMROyJiPbAOOEHSZODgiFiRxl9uBM6oNW4zM6tGzQlE\n0lnASuC9wFnAw5LOLHncD0t6PHVxHZbKpgC/yG3zbCqbkpaHlltJvuHQzOpBtV5UJekx4G0RsTmt\nHwncFxFvGKPeVODOiDg2rXcCvyK7BPjvgMkR8UFJVwErIuKmtN21ZN1UG4DLI+LUVH4ycGlEnD7C\n8RYACwA6Oztn9/T01PTvG2pgYID1W1rvwYudk2DTb/d+PzOnHLL3O8kZGBigo6OjrvusB8dVjOMq\nZiLGNWfOnFUR0VXLtkXGQPYZTB7J85S4kz0iNg0uS7oGuDOt9gNH5zY9KpX1p+Wh5SPtfymwFKCr\nqyu6u7uLhghAb28vSx7aXqpulS6ZuZMlfXWYgaYv+7fV656Q3t5eyp7rKjmuYhxXMe0eV5EE8M/p\nCqzzJZ0P3EU2oF1IGtMY9C6yy4MBlgPzJR0gaRowHVgZERuBrZJOTFdfnQvcUfS4ZmZWXzX/KRsR\nn5T0HuAtqWhpRNw+Wh1J3wK6gSMkPQt8FuiWNIusC2sD8Ndp/2skLQPWAjuBi9MVWAAXkV3RNYms\nW8tXYJmZNVmhvpCIuBW4tcD2Zw9TfO0o2y8GFg9T/ghwbK3HNTOz6o3ZhSXpofRzm6Studc2SVur\nD9Gq5CuyzKysMVsgEXFS+nlQ9eGYmdl4UeQ+kG/UUmZmZu2hyFVYx+RXJO0HzK5vONYMwz033V1b\nZjaWWsZAFknaBrw+P/4BbMKX05qZta0xE0hE/H0a//h8RBycXgdFxCsjYlEDYjQzsxZUpAvrTkkH\nAkj6H5KulPTqiuKyJnC3lZkVUSSBXA38RtIbgEuAn+HH2ZqZta0iCWRnmk59HnBVRHyZbFp3MzNr\nQ0XuRN8maRHwfuBkSfsA+1cTljVLvhtrcLleEy6a2cRSpAXyPmAH8MGIeI5sVtzPVxKVmZm1vCJP\nJHyObB6sA1LRr4BRJ1M0M7OJq8id6BcC3wG+loqmAP+viqCsdflKLTMbVKQL62Kyqdy3AkTE08Af\nVxGUtRYnDTMbTpEEsiMifje4kqYyqe15uDbh9fVvaXYIZtZgRRLIg5I+BUyS9Dbg28A/VROWtbLh\n5s4ys/ZTJIEsBH4J9JE9RfBu4H9VEZSZmbW+Io+0/QNwTXqZmVmbqzmBSFrPMGMeEfGaukZk48pg\nV9YlM5sciJk1XJE70btyy68A3gscXt9wrFV5zMPMhipyI+HzuVd/RPxfwHNcmJm1qSJdWMfnVvch\na5EUacHYBONWiVl7K5IAlvDyGMhOYANZN5YZ4MkXzdpNkQRyJ1kCUVoP4HQpW42IK+sbmpmZtbIi\n94HMBv4GmAy8CvgQcDzZM0GGfS6IpOskbZb0RK7scEn3Sno6/Tws994iSeskPSXptFz5bEl96b0v\naTBrWcvyzYZmE1+RBHIUcHxEfCIiLiFLKH8SEZ+LiM+NUOd6YO6Q
soXA/RExHbg/rSNpBjAfOCbV\n+YqkfVOdq4ELgenpNXSfZmbWYEUSSCfwu9z671LZiCLi+8CvhxTPA25IyzcAZ+TKeyJiR0SsB9YB\nJ0iaDBwcESvSExFvzNUxM7MmKTIGciOwUtLgM0DOIGthFNUZERvT8nO8nISmACty2z2byn6floeW\nW4ty15VZe1D2R32NG2eX8p6cVr8fET+poc5U4M6IODatvxgRh+befyEiDpN0FbAiIm5K5dcC95Bd\n7XV5RJyayk8GLo2I00c43gJgAUBnZ+fsnp6emv99eQMDA6zfsqtU3Sp1ToJNv212FHuqJa6ZUw5p\nTDA5AwMDdHR0NPy4Y3FcxTiuYvYmrjlz5qyKiK6xtyx4H0dEPAo8Wiqql22SNDkiNqbuqc2pvB84\nOrfdUamsPy0PLR8pxqXAUoCurq7o7u4uFWRvby9LHtpeqm6VLpm5kyV9rXf7TS1xbTinuzHB5PT2\n9lL2M1Alx1WM4yqmUXEVGQOpl+XAeWn5POCOXPl8SQdImkY2WL4ydXdtlXRiuvrq3FwdG0d8ZZbZ\nxFLpn7KSvgV0A0dIehb4LHA5sEzSBcAzwFkAEbFG0jJgLdmNihdHxGAf0kVk4y2TyLq17qkybjMz\nG1ulCSQizh7hrVNG2H4xsHiY8keAY+sYmjWZ71o3G/+a0YVlZmYTgBOINdzQcRCPjZiNT04g1lRO\nHGbjlxOImZmV4gRiZmalOIGYmVkpTiBmZlZK682JYW1r6IB6/h4R3zdi1nrcAjEzs1KcQKxl+f4Q\ns9bmBGItL59EnFDMWocTiJmZleIEYuOOu7bMWoMTiJmZleIEYmZmpTiB2LjlwXWz5vKNhDauOXGY\nNY9bIGZmVooTiJmZleIuLJsw8vNlDS5fP/fAZoZkNqG5BWITjsdFzBrDLRCb0Pr6t3D+MAnFs/qa\n7T23QKytubViVp4TiJmZldK0LixJG4BtwC5gZ0R0STocuAWYCmwAzoqIF9L2i4AL0vYfiYjvNiFs\nmyDc8jDbe81ugcyJiFkR0ZXWFwL3R8R04P60jqQZwHzgGGAu8BVJ+zYjYJu4nFTMiml2AhlqHnBD\nWr4BOCNX3hMROyJiPbAOOKEJ8dkENNbsvk4sZsNTRDTnwNJ6YAtZl9TXImKppBcj4tD0voAXIuJQ\nSVcBKyLipvTetcA9EfGdYfa7AFgA0NnZObunp6dUfAMDA6zfsqtU3Sp1ToJNv212FHuaaHHNnHLI\nS8t9/Vt2W6+HgYEBOjo66rrPenBcxUzEuObMmbMq1ys0qmZexntSRPRL+mPgXkk/zb8ZESGpcHaL\niKXAUoCurq7o7u4uFVxvby9LHtpeqm6VLpm5kyV9rXf19USLa8M53bmWx35sOKe7rnH19vZS9rNZ\nJcdVTLvH1bQurIjoTz83A7eTdUltkjQZIP3cnDbvB47OVT8qlZlVwt1WZmNrSgKRdKCkgwaXgb8E\nngCWA+elzc4D7kjLy4H5kg6QNA2YDqxsbNTWzgbHSYYmFj8d0dpZs/ocOoHbs2EO9gO+GRH/LOnH\nwDJJFwDPAGcBRMQaScuAtcBO4OKIaL0BCmsLYyWMqQvv8p3u1haakkAi4ufAG4Ypfx44ZYQ6i4HF\nFYdmZmY1arXLeM3GpbEuA3Y3l01ErXfZjNk4Ndwjdt2VZROZE4hZhUZ6bvtYzynxOIqNB+7CMmuC\nvv4t7taycc8tELMmchKx8cwtELMWNdJ9J2atwi0Qs3Emn0Q8TmLN5ARi1uJqaXU4qVgzOIGYjWMj\nJZeRLiP25cVWT04gZhPMSJcOO2lYvXkQ3axNFB2A9x30NhYnELM2NLSVMrg+eH/KSK0YJ5TW18jf\nkbuwzNrUcJcIXzKz9nruEjMnEDMb03B/1Q4dX3FiGd1EnJ7GXVhmtteG6xKr502QtUz94jGbxnML\nxMwqM/Sv7nwrZbgv+8Fta/lr
3cmi+ZxAzKxSY3V/1VJ/cGym1nruTmsMJxAzaxn1uOKrln2MlFiq\nGKeoZ0spH18r3OPjBGJmbWmk7rShX/jDdcENLR+63+G+5Ic79mj7GSvuVuAEYmZtp0hLZ7QkMPhg\nsKJdayOVjTT1TJn9NoITiJlZSX39Wzh/L8d4Rqo3HsZvfBmvmVkLaqWuqpE4gZiZWSnjKoFImivp\nKUnrJC1sdjxmZu1s3CQQSfsCXwbeDswAzpY0o7lRmZm1r3GTQIATgHUR8fOI+B3QA8xrckxmZm1r\nPCWQKcAvcuvPpjIzM2sCRUSzY6iJpDOBuRHxV2n9/cCbIuJvh2y3AFiQVl8HPFXykEcAvypZt0qO\nqxjHVYzjKmYixvXqiDiylg3H030g/cDRufWjUtluImIpsHRvDybpkYjo2tv91JvjKsZxFeO4imn3\nuMZTF9aPgemSpkn6I2A+sLzJMZmZta1x0wKJiJ2S/hb4LrAvcF1ErGlyWGZmbWvcJBCAiLgbuLtB\nh9vrbrCKOK5iHFcxjquYto5r3Ayim5lZaxlPYyBmZtZCnECGaJXpUiQdLekBSWslrZH00VR+maR+\nSavT6x1NiG2DpL50/EdS2eGS7pX0dPp5WINjel3unKyWtFXSx5p1viRdJ2mzpCdyZSOeI0mL0mfu\nKUmnNTiuz0v6qaTHJd0u6dBUPlXSb3Pn7qsNjmvE312Tz9ctuZg2SFqdyhtyvkb5bmj85ysi/Eov\nssH5nwGvAf4IeAyY0aRYJgPHp+WDgH8jm8LlMuATTT5PG4AjhpT9H2BhWl4IXNHk3+NzwKubdb6A\nvwCOB54Y6xyl3+tjwAHAtPQZ3LeBcf0lsF9aviIX19T8dk04X8P+7pp9voa8vwT4TCPP1yjfDQ3/\nfLkFsruWmS4lIjZGxKNpeRvwJK195/084Ia0fANwRhNjOQX4WUQ806wAIuL7wK+HFI90juYBPRGx\nIyLWA+vIPosNiSsivhcRO9PqCrJ7rBpqhPM1kqaer0GSBJwFfKuKY48S00jfDQ3/fDmB7K4lp0uR\nNBU4Dng4FX04dTdc1+iuoiSA+yStSnf+A3RGxMa0/BzQ2YS4Bs1n9//UzT5fg0Y6R630ufsgcE9u\nfVrqjnlQ0slNiGe4312rnK+TgU0R8XSurKHna8h3Q8M/X04gLU5SB3Ar8LGI2ApcTdbFNgvYSNaE\nbrSTImIW2czIF0v6i/ybkbWbm3J5n7KbTN8JfDsVtcL52kMzz9FIJH0a2AncnIo2An+SftcfB74p\n6eAGhtSSv7ucs9n9D5WGnq9hvhte0qjPlxPI7mqaLqVRJO1P9gG5OSJuA4iITRGxKyL+AFxDRU33\n0UREf/q5Gbg9xbBJ0uQU92Rgc6PjSt4OPBoRm1KMTT9fOSOdo6Z/7iSdD5wOnJO+fEhdHs+n5VVk\nfeevbVRMo/zuWuF87Qe8G7hlsKyR52u47waa8PlyAtldy0yXkvpXrwWejIgrc+WTc5u9C3hiaN2K\n4zpQ0kGDy2QDsE+Qnafz0mbnAXc0Mq6c3f4qbPb5GmKkc7QcmC/pAEnTgOnAykYFJWku8D+Bd0bE\nb3LlRyp7Dg+SXpPi+nkD4xrpd9fU85WcCvw0Ip4dLGjU+Rrpu4FmfL6qvmJgvL2Ad5Bd1fAz4NNN\njOMksibo48Dq9HoH8A2gL5UvByY3OK7XkF3R8RiwZvAcAa8E7geeBu4DDm/COTsQeB44JFfWlPNF\nlsQ2Ar8n63O+YLRzBHw6feaeAt7e4LjWkfWRD37Ovpq2fU/6Ha8GHgX+e4PjGvF318zzlcqvBz40\nZNuGnK9Rvhsa/vnynehmZlaKu7DMzKwUJxAzMyvFCcTMzEpxAjEzs1KcQMzMrBQnEDMzK8UJxGwI\nSedLuqqCfb5qL+rP0hhT0VcRt9lonEDMGuN8oHQCIZsPquHPfjEbjROItQVJn5T0kbT8RUn/kp
bf\nKulmSR+Q9G+SVgJvydXrVPaQpcfS682p/OOSnkivj6WyqZKelHRNetDP9yRNknQm0AXcnGZqnSRp\ndpqxdZWk7+bmMOqVdIWklSmek9O0Ov8beF+q/74a/r1HSrpV0o/T6y2p/LI0s22vpJ8PnhOzMpxA\nrF38gGz6bci+zDvShHQnk01d8zmyxHES2QN4Bn0JeDAi3kD2YKE1kmYDHwDeBJwIXCjpuLT9dODL\nEXEM8CLwnoj4DvAI2USFs8hmvP1H4MyImA1cByzOHXO/iDgB+Bjw2cieTfMZ4JaImBURtzC2fwC+\nGBFvJJti4+u59/4MOI1scsLPpvNgVth+zQ7ArEFWAbPT9No7yOYq6iJLIP8C9EbELyF7ZCkvz6L6\nVuBcgIjYBWyRdBJwe0RsT9vflvazHFgfEatzx5w6TCyvA44F7s3mxWNfsvmWBg3OrjpS/VqcCsxI\n+wc4OE3/DXBXROwAdkjaTPbciGeH2YfZqJxArC1ExO8lrScbi/gh2UR0c4A/Bb5M9ld5PezILe8C\nJg2zjYA1EfHnY+xjF+X/j+4DnBgR/7HbgbOEMjRGfw9YKe7CsnbyA+ATwPfT8oeAn5A9xvW/Snpl\n6s55b67O/cDfAEjaV9Ihqe4Zkv5TmtL+XalsNNvInl8N2YyoR0r687Tf/SUdU6B+Lb4HfHhwRdKs\nAnXNauIEYu3kB8Bk4EeRPXDqP4AfRPYY0MuAHwH/SvaM6UEfBeZI6iPrUpoR2fOoryd7psLDwNcj\n4idjHPt64KuSVpN1WZ0JXCHpMbLpuN88Rv0HyLqkahpEBz4CdCl7HOxasmRpVleezt3MzEpxC8TM\nzErx4JnZOCPpA2Rda3n/GhEXNyMea1/uwjIzs1LchWVmZqU4gZiZWSlOIGZmVooTiJmZleIEYmZm\npfx/1eqfF2XrQj8AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fb118b9d150>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "wdcontent_len = df_eval_content.wdcontent_len.values\n",
    "max_len = max(wdcontent_len)\n",
    "min_len = min(wdcontent_len)\n",
    "mean_len = np.mean(wdcontent_len)\n",
    "median_len = np.median(wdcontent_len)\n",
    "print '** content 长度（词数）'\n",
    "print 'max = %d， min = %d, mean = %d, median = %d' % (max_len, min_len, mean_len, median_len)\n",
    "\n",
    "plt.hist(wdcontent_len, bins = 200, range=(0, 200))\n",
    "plt.xlabel('wdcontent_len')\n",
    "plt.ylabel('question_number')\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "--------"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练集数据分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%time df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\\t', names=['question_id', 'ch_title', 'word_title', 'ch_content', 'word_content'],dtype={'question_id': object})\n",
    "print '训练集问题数量 \\033[1;35m %d \\033[0m ' % len(df_train)\n",
    "df_train.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 26.4 s, sys: 1.58 s, total: 28 s\n",
      "Wall time: 36.6 s\n",
      "训练集问题数量 \u001b[1;35m 2999967 \u001b[0m \n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word_title</th>\n",
       "      <th>word_content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w...</td>\n",
       "      <td>w231,w54,w1681,w54,w11506,w5714,w7,w54,w744,w1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w...</td>\n",
       "      <td>w12508,w1380,w72,w27045,w276,w111</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          word_title  \\\n",
       "0  w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w...   \n",
       "1  w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w...   \n",
       "\n",
       "                                        word_content  \n",
       "0  w231,w54,w1681,w54,w11506,w5714,w7,w54,w744,w1...  \n",
       "1                  w12508,w1380,w72,w27045,w276,w111  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\\t', usecols=[2, 4], names=[ 'word_title','word_content'],dtype={'question_id': object})\n",
    "print '训练集问题数量 \\033[1;35m %d \\033[0m ' % len(df_train)\n",
    "df_train.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "一共有213580879个词；词典大小为570782\n",
      "time costed 103.478s\n"
     ]
    }
   ],
   "source": [
    "from itertools import chain\n",
    "\n",
    "# 统计所有的词数\n",
    "def split_words(ss):\n",
    "    if type(ss) is float:\n",
    "        return []\n",
    "    return ss.split(',')\n",
    "time0 = time.time()\n",
    "words_content = map(split_words, df_train.word_content.values)\n",
    "words_title = map(split_words, df_train.word_title.values)\n",
    "words_content = list(chain(*words_content))\n",
    "words_title = list(chain(*words_title))\n",
    "words = words_content + words_title\n",
    "sr_words = pd.Series(words)\n",
    "words_count = sr_words.value_counts()\n",
    "print('一共有%d个词；词典大小为%d' %(len(words), len(words_count)))\n",
    "print('time costed %gs' % (time.time()-time0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17107\n"
     ]
    }
   ],
   "source": [
    "print(words_count['w1111'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2999967\n",
      "2999967\n"
     ]
    }
   ],
   "source": [
    "questions2 = df_train.question_id.values\n",
    "print sum(questions1 == questions2)\n",
    "print len(questions1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "没有内容的问题 \u001b[1;35m 765049 \u001b[0m  \n",
      "一共有 0.255019 没有问题描述\n"
     ]
    }
   ],
   "source": [
    "na_count = 0\n",
    "ch_contents = df_train.ch_content.values\n",
    "for ch_content in ch_contents:\n",
    "    if type(ch_content) is float:\n",
    "        na_count += 1\n",
    "print '没有内容的问题 \\033[1;35m %d \\033[0m  ' % na_count\n",
    "print '一共有 %g 没有问题描述' % (na_count / len(df_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "328877\n",
      "422123\n",
      "633584\n",
      "768738\n",
      "818616\n",
      "876828\n",
      "1273673\n",
      "1527297\n",
      "1636237\n",
      "1682969\n",
      "2052477\n",
      "2628516\n",
      "2657464\n",
      "2904162\n",
      "2993517\n",
      "没有题目的问题数量 \u001b[1;35m 15 \u001b[0m  \n"
     ]
    }
   ],
   "source": [
    "word_titles = df_train.word_title.values\n",
    "\n",
    "na_count = 0\n",
    "na_indexs = list()\n",
    "for i in xrange(len(word_titles)):\n",
    "    word_title = word_titles[i]\n",
    "    if type(word_title) is float:\n",
    "        na_indexs.append(i)\n",
    "        na_count += 1\n",
    "        print i\n",
    "        \n",
    "print '没有题目的问题数量 \\033[1;35m %d \\033[0m  ' % na_count\n",
    "\n",
    "# for na_index in na_indexs:\n",
    "#     df_train.loc[na_index, 'word_title'] = df_train.loc[na_index, 'ch_title']\n",
    "df_train = df_train.drop(na_indexs) # 将没有title的数据直接丢弃"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "** content 长度（词数）\n",
      "max = 187， min = 1, mean = 12, median = 11\n"
     ]
    }
   ],
   "source": [
    "df_train['word_title'] = df_train.word_title.apply(lambda text: text.split(','))\n",
    "df_train['ch_title'] = df_train.ch_title.apply(lambda text: text.split(','))\n",
    "df_train['wdtitle_len'] = df_train.word_title.apply(lambda ws: len(ws))\n",
    "df_train['chtitle_len'] = df_train.ch_title.apply(lambda chs: len(chs))\n",
    "\n",
    "wdtitle_len = df_train.wdtitle_len.values\n",
    "max_len = max(wdtitle_len)\n",
    "min_len = min(wdtitle_len)\n",
    "mean_len = np.mean(wdtitle_len)\n",
    "median_len = np.median(wdtitle_len)\n",
    "print '** content 长度（词数）'\n",
    "print 'max = %d， min = %d, mean = %d, median = %d' % (max_len, min_len, mean_len, median_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZwAAAEXCAYAAACZNvIiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xu4HFWZ7/Hvj4uQEwETgpsQkICAc4AoyiYwoDNbUYii\nBj2AQYSgSPSAPPoYL6DMgDCZARVwEIUTICcBEYggD+EmJ1w2ikpCgozhIiZKMiQEIknMBSSyw3v+\nqLWlaPet9u6q3un8Ps/TT1evqlX9VqXT716rVq9SRGBmZla2LRodgJmZbR6ccMzMrBJOOGZmVgkn\nHDMzq4QTjpmZVcIJx8zMKuGEY5ssSVdI+pdGxzEQktZL2rPRcfSXpHMl/ajRcdimwQnHGkLSYknv\nH8g+IuLzEXF+vWIqm6R2SZ/Nl0XEGyPij42KyaxKTjg2KEnaqtEx2GuU8feFDYg/QFY5SdcCbwFu\nS11KX5M0WlJIOkXSfwP3pW1/Iuk5SWsk/VzSfrn9TJf0b2m5TdJSSZMlrZC0XNKne4hhD0kPSFon\nabakyzq7hjr3VbP931pkkraQdKakP0haKWmmpOFp3baSfpTK/yzpYUktkqYA7wEuS8d8Wdo+JO2V\nlneQdI2kP0laIunszi95SSdLelDSdyWtlvS0pA92c2yflnRb7vVCST/JvX5G0gFp+dAU45r0fGhu\nu3ZJUyT9EngJ2LP2vAEjctt3eezdxLhY0lck/Ta9942Sts0fa832+fM0XdIPJd2VzuUvJe0s6Xvp\n3PxO0ju7+7e3xnHCscpFxInAfwMfSV1K386t/mfgfwJHptd3AXsDbwYeAa7rYdc7AzsAo4BTgB9I\nGtbNtj8G5pN9YZ4PTCxwCGcAR6dYdwFWAz9I6yamGHYDdgQ+D/wlIr4J/AL4QjrmL3Sx3++nunum\nfZ8E5JPmwcBTKeZvA1dLUhf7eQB4T0qMuwBvAP4RIF0veiPw25Qk7wAuTbFeDNwhacfcvk4EJgHb\nAUvo+bx1eexdxNfpOGAcsAfwduDkHrbtqu7ZKY4NwK/JPh8jgJvSsdgg44Rjg825EfFiRPwFICKm\nRcS6iNgAnAu8Q9IO3dR9BTgvIl6JiDuB9cDbajeS9BbgIOBfImJDRPwcuK12ux58HvhmRCzNxXVM\n6gZ8hezLdq+I2BgR8yNibW87lLQlMAE4Kx3vYuAisi/8Tksi4sqI2AjMAEYCf9eCSNeE1gEHAP8E\n3A08K+kfyBLZLyLiVeAoYGFEXBsRHRFxPfA74CO53U2PiMcjoiO9X0/nreixXxoRz0bEqrSfA3o7\nTzm3pP2/DNwCvBwR16RzcyPgFs4g5IRjg80znQuStpR0Qeq6WgssTqtGdFkTVqYvxk4vkf01X2sX\nYHVEvJgrW1Igxt2BW1K30Z+BJ4GNZF/+15J9wd8g6VlJ35a0dR/2OQLYuiaOJWSttU7PdS5ExEtp\nsavjg6yV00aWcB4A2smSzT+n15Cdh9rjrn3PZ3LLvZ23osf+XG65u3+r7jyfW/5LF6+L7Msq4oRj\njdLdNOX58k8C44H3k3XVjE7lXXUjFbEcGCZpaK7sLbnlF4H/0fkitT52yq1/BvhgRLwp99g2Ipal\n1tW3ImJf4FDgw2RdY7XHVusFshbC7jUxLSt6cElnwnlPWn6Av084z9a8X1fvmY+5x/PWy7EXUXv+\nd+7HPmwQcsKxRnme7FpFT7Yj659fSfYF9O/1eOOIWALMA74l6Q2S3s3ru5F+D2wr6aj0F/rZwDa5\n9VcAUyTtDiBpJ0nj0/J7JY1JSWotWRJ5NdXr9phTV9DMtN/t0r6/DPT3Ny4PAO8FhkTEUrLrR+PI\nurx+k7a5E9hH0iclbSXpE8C+wO3dxNjjeevl2Iv4L2A/SQekgQTn9mMfNgg54Vij/AdwduqW+ko3\n21xD1mWzDHgCeKiO7/9Jsovwq4Bz0n
sBEBFrgNOAq9J7vwjkR639JzAL+H+S1qW4Dk7rdia7aL2W\nrKvtAbKups56x6SRVJd2EdMZ6b3+CDxIdoF+Wn8OLiJ+T3YN6xfp9dq031+m5EZErCRrhUwmS+pf\nAz4cES/0sOtuzxs9H3vR2M8D7gEWkp0LawLyDdjMsl/Mk13s/lSjYzFrVm7hmJlZJZxwzMysEu5S\nMzOzSriFY2ZmlfAEiTkjRoyI0aNH97v+iy++yNChQ3vfsGKOqxjHVYzjKqYZ45o/f/4LEbFTrxtG\nhB/pceCBB8ZA3H///QOqXxbHVYzjKsZxFdOMcQHzog/fse5SMzOzSjjhmJlZJZxwzMysEk44ZmZW\nCSccMzOrhBOOmZlVwgnHzMwq4YRjZmaVcMIxM7NKeGob67fRZ97R4/rFFxxVUSRmtilwwrHS9JaQ\npo8bfPNJmVl5Su1Sk7SbpPslPSHpcUlfTOXnSlom6dH0+FCuzlmSFkl6StKRufIDJS1I6y6VpFS+\njaQbU/kcSaNzdSZKWpgeE8s8VjMz61nZLZwOYHJEPCJpO2C+pNlp3SUR8d38xpL2BSYA+wG7APdI\n2ieye7BfDpwKzAHuBMYBdwGnAKsjYi9JE4ALgU9IGk52z/VWINJ7z4qI1SUfs5mZdaHUFk5ELI+I\nR9LyOuBJYFQPVcYDN0TEhoh4GlgEjJU0Etg+Ih5KM5NeAxydqzMjLd8EHJ5aP0cCsyNiVUoys8mS\nlJmZNUBl13BSV9c7yVoohwFnSDoJmEfWClpNloweylVbmspeScu15aTnZwAiokPSGmDHfHkXdfJx\nTQImAbS0tNDe3t7vY1y/fv2A6pelrLgmj+kYUP3N7XwNlOMqxnEVU0VclSQcSW8Ebga+FBFrJV0O\nnE/W1XU+cBHwmSpiqRURU4GpAK2trdHW1tbvfbW3tzOQ+mUpK66TexkU0Jvp44ZuVudroBxXMY6r\nmCriKv13OJK2Jks210XETwEi4vmI2BgRrwJXAmPT5suA3XLVd01ly9Jybfnr6kjaCtgBWNnDvszM\nrAHKHqUm4GrgyYi4OFc+MrfZx4DH0vIsYEIaebYHsDcwNyKWA2slHZL2eRJwa65O5wi0Y4D70nWe\nu4EjJA2TNAw4IpWZmVkDlN2ldhhwIrBA0qOp7BvA8ZIOIOtSWwx8DiAiHpc0E3iCbITb6WmEGsBp\nwHRgCNnotLtS+dXAtZIWAavIRrkREasknQ88nLY7LyJWlXScZmbWi1ITTkQ8CKiLVXf2UGcKMKWL\n8nnA/l2Uvwwc282+pgHT+hqvmZmVx3OpmZlZJTy1jTXMgmVruh3p5nnYzJqPWzhmZlYJJxwzM6uE\nE46ZmVXCCcfMzCrhhGNmZpVwwjEzs0o44ZiZWSWccMzMrBL+4af1aPQAb0FgZtbJLRwzM6uEE46Z\nmVXCCcfMzCrhhGNmZpVwwjEzs0o44ZiZWSWccMzMrBJOOGZmVgknHDMzq4RnGrBBqbcZDnwLarNN\nj1s4ZmZWCSccMzOrhBOOmZlVwgnHzMwq4YRjZmaVcMIxM7NKOOGYmVklnHDMzKwSTjhmZlYJJxwz\nM6tEqQlH0m6S7pf0hKTHJX0xlQ+XNFvSwvQ8LFfnLEmLJD0l6chc+YGSFqR1l0pSKt9G0o2pfI6k\n0bk6E9N7LJQ0scxjNTOznpXdwukAJkfEvsAhwOmS9gXOBO6NiL2Be9Nr0roJwH7AOOCHkrZM+7oc\nOBXYOz3GpfJTgNURsRdwCXBh2tdw4BzgYGAscE4+sZmZWbVKTTgRsTwiHknL64AngVHAeGBG2mwG\ncHRaHg/cEBEbIuJpYBEwVtJIYPuIeCgiArimpk7nvm4CDk+tnyOB2RGxKiJWA7N5LUmZmVnFKruG\nk7q63gnMAVoiYnla9RzQkpZHAc/kqi1NZaPScm356+pERAewBtixh32ZmVkDVHJ7AklvBG4GvhQR\na9
PlFwAiIiRFFXF0E9skYBJAS0sL7e3t/d7X+vXrB1S/LAOJa/KYjvoGk9MypP/7L/M8N+O/Y5kc\nVzGbc1ylJxxJW5Mlm+si4qep+HlJIyNieeouW5HKlwG75arvmsqWpeXa8nydpZK2AnYAVqbytpo6\n7bXxRcRUYCpAa2trtLW11W7SZ+3t7QykflkGEtfJvdyXZiAmj+ngogX9+wguPqGtvsHkNOO/Y5kc\nVzGbc1xlj1ITcDXwZERcnFs1C+gcNTYRuDVXPiGNPNuDbHDA3NT9tlbSIWmfJ9XU6dzXMcB96TrP\n3cARkoalwQJHpDIzM2uAsls4hwEnAgskPZrKvgFcAMyUdAqwBDgOICIelzQTeIJshNvpEbEx1TsN\nmA4MAe5KD8gS2rWSFgGryEa5ERGrJJ0PPJy2Oy8iVpV1oGZm1rNSE05EPAiom9WHd1NnCjCli/J5\nwP5dlL8MHNvNvqYB0/oar5mZlcczDZiZWSUqGaVmVm+jexjMsPiCoyqMxMz6yi0cMzOrhBOOmZlV\nwgnHzMwq0aeEo8xuvW9pZmbWtT4lnPRDyjtLjsXMzJpYkS61RyQdVFokZmbW1IoMiz4YOEHSEuBF\nsh90RkS8vZTIzMysqRRJOEf2vomZmVnX+tylFhFLyGZlfl9afqlIfTMz27z1uYUj6RygFXgb8H+B\nrYEfkU3QaZuonn6xb2ZWT0VaKB8DPkp2/YaIeBbYroygzMys+RRJOH9Nw6MDQNLQckIyM7NmVCTh\nzJT0f4A3SToVuAe4spywzMys2fT5Gk5EfFfSB4C1wD7Av0bE7NIiMzOzplL09gQLyO64GWnZzMys\nT/rcpSbps8Bc4OPAMcBDkj5TVmBmZtZcirRwvgq8MyJWAkjaEfgVvoWzmZn1QZFBAyuBdbnX61KZ\nmZlZr3pt4Uj6clpcBMyRdCvZNZzxwG9LjM3MzJpIX7rUOn/c+Yf06HRr/cMxG7jeZk9YfMFRFUVi\nZnm9JpyI+FYVgZiZWXMrMpdaK/BNYPd8Pd+ewMzM+qLIKLXryEaqLQBeLSccMzNrVkUSzp8iYlZp\nkZiZWVMrknDOkXQVcC+wobMwIn5a96jMzKzpFEk4nwb+gew+OJ1dagE44ZiZWa+KJJyDIuJtpUVi\nZmZNrchMA7+StG9pkZiZWVMr0sI5BHhU0tNk13AEhIdFm5lZXxRJOONKi8LMzJpekS616ObRLUnT\nJK2Q9Fiu7FxJyyQ9mh4fyq07S9IiSU9JOjJXfqCkBWndpZKUyreRdGMqnyNpdK7OREkL02NigeM0\nM7MSFGnh3EGWYARsC+wBPAXs10Od6cBlwDU15ZdExHfzBen60IS0v12AeyTtExEbgcuBU4E5wJ1k\nra27gFOA1RGxl6QJwIXAJyQNB84BWlPM8yXNiojVBY7XzMzqqM8tnIgYExFvT897A2OBX/dS5+fA\nqj6+xXjghojYEBFPk81OPVbSSGD7iHgoIoIseR2dqzMjLd8EHJ5aP0cCsyNiVUoys3GXoJlZQxW9\nxfTfRMQjkg7uZ/UzJJ0EzAMmp6QwCngot83SVPZKWq4tJz0/k+LpkLQG2DFf3kWd15E0CZgE0NLS\nQnt7ez8PCdavXz+g+mXpKa7JYzqqDSanZUhj3r+3f6NN8d+xkRxXMZtzXEUm7/xy7uUWwLuAZ/vx\nnpcD55N1dZ0PXAQ07FbVETEVmArQ2toabW1t/d5Xe3s7A6lflp7iOrmXqfzLNHlMBxct6PffPP22\n+IS2Htdviv+OjeS4itmc4yoyaGC73GMbsms644u+YUQ8HxEbI+JV4EqyrjmAZcBuuU13TWXL0nJt\n+evqSNoK2IHsLqTd7cvMzBqkz39e1uu+OJJGRsTy9PJjQOcItlnAjyVdTDZoYG9gbkRslLRW0iFk\ngwZOAr6fqzOR7FrSMcB9ERGS7gb+XdKwtN0RwFn1iN/MzPqnSJfa
PsBXgNG8/n447+uhzvVAGzBC\n0lKykWNtkg4g61JbDHwu7edxSTOBJ4AO4PQ0Qg3gNLIRb0PIRqfdlcqvBq6VtIhscMKEtK9Vks4H\nHk7bnRcRfR28YE2utzuCTh83tKJIzDYvRTrQfwJcAVwFbOxlWwAi4vguiq/uYfspwJQuyucB+3dR\n/jJwbDf7mgZM60ucZmZWviIJpyMiLi8tEjMza2pFBg3cJuk0SSMlDe98lBaZmZk1lSItnM7pYb6a\nKwtgz/qFY2ZmzarIKLU9elov6QMRMXvgIZmZWTMq0qXWmwvruC8zM2sy9Uw4quO+zMysydQz4fR4\nqwIzM9u81TPhmJmZdaueCWdxHfdlZmZNptBUvZIO5e+ntrkmPX+8rpGZmVlTKTKX2rXAW4FHeW1q\nm84bopmZmfWoSAunFdg33XXTzMyskCLXcB4Ddi4rEDMza25FWjgjgCckzQU2dBZGxEfrHpWZmTWd\nIgnn3LKCMDOz5ldkLrUHJLUAB6WiuRGxopywzBpnwbI1nNzNTdoWX3BUxdGYNY8+X8ORdBwwl+yG\nZ8cBcyQdU1ZgZmbWXIp0qX0TOKizVSNpJ+Ae4KYyAjMzs+ZSZJTaFjVdaCsL1jczs81YkRbOzyTd\nDVyfXn8CuLP+IZmZWTMqMmjgq5L+F3BYKpoaEbeUE5aZmTWbQnOpRcTNwM0lxWJmZk2s14Qj6cGI\neLekdbz+njcCIiK2Ly06MzNrGr0mnIh4d3rervxwzMysWRWaLToiTuytzAafnn7IaGZWlSLDmvfL\nv5C0FXBgfcMxM7Nm1WvCkXRWun7zdklr02Md8Dxwa+kRmplZU+g14UTEf6TrN9+JiO3TY7uI2DEi\nzqogRjMzawJFutRulzQUQNKnJF0safeS4jIzsyZTJOFcDrwk6R3AZOAP+PbSZmbWR0V++NkRESFp\nPHBZRFwt6ZSyAjMbjEb3MtrPty8w616RFs46SWcBJwJ3SNoC2LqnCpKmSVoh6bFc2XBJsyUtTM/D\ncuvOkrRI0lOSjsyVHyhpQVp3qSSl8m0k3ZjK50ganaszMb3HQkkTCxynmZmVoEjC+QTZraU/ExHP\nAbsC3+mlznRgXE3ZmcC9EbE3cG96jaR9gQlkw6/HAT+UtGWqczlwKrB3enTu8xRgdUTsBVwCXJj2\nNRw4BzgYGAuck09sZmZWvT4nnJRkbga2SUUvAD1O3hkRPwdW1RSPB2ak5RnA0bnyGyJiQ0Q8DSwC\nxkoaCWwfEQ9FRJBdNzq6i33dBByeWj9HArMjYlVErAZm8/eJz8zMKlRkpoFTgUnAcOCtwCjgCuDw\ngu/ZEhHL0/JzQEtaHgU8lNtuaSp7JS3XlnfWeQYgIjokrQF2zJd3Uaf2uCal46KlpYX29vaCh/Oa\n9evXD6h+WVqGwOQxHY0O4+80Y1xl/vsP1s+X4ypmc46ryKCB08m6p+YARMRCSW8eyJunQQjR+5bl\niYipwFSA1tbWaGtr6/e+2tvbGUj9snz/ulu5aEGhicErMXlMR9PFtfiEtvoGkzNYP1+Oq5jNOa4i\n13A2RMRfO1+kqW36kyyeT91kpOfOu4guA3bLbbdrKluWlmvLX1cnxbMD2Z1Iu9uXmZk1SJGE84Ck\nbwBDJH0A+AlwWz/ecxbQOWpsIq9NjzMLmJBGnu1BNjhgbup+WyvpkHR95qSaOp37Oga4L13nuRs4\nQtKwNFjgiFRmZmYNUqTf4EyyUWELgM+R3V76qp4qSLoeaANGSFpKNnLsAmBm+g3PEuA4gIh4XNJM\n4AmgAzg9IjamXZ1GNuJtCHBXegBcDVwraRHZ4IQJaV+rJJ0PPJy2Oy8iagcvmJlZhYrcYvpV4Mr0\n6Gud47tZ1eVAg4iYAkzponwesH8X5S8Dx3azr2nAtL7GamZm5SoySu1purhmExF71jUiMzNrSkW6\n1Fpzy9uStSyG1zccMzNrVkW6
1FbWFH1P0nzgX+sbktmmy3OtmXWvSJfau3IvtyBr8Qy+H1GYmdmg\nVCRhXMRr13A6gMV0c8HezMysVpGEcztZwlF6HcCH08TNRMTF9Q3NzMyaSZGEcyBwENmPLgV8BJgL\nLCwhLjMzazJFEs6uwLsiYh2ApHOBOyLiU2UEZmZmzaXI1DYtwF9zr//KazM9m5mZ9ahIC+caYK6k\nznvgHE023YyZmVmvivwOZ4qku4D3pKJPR8RvygnLzMyaTaHf0UTEI8AjJcViZmZNzD/cNKtQTzMR\neBYCa3ZFBg2YmZn1mxOOmZlVwgnHzMwq4YRjZmaVcMIxM7NKOOGYmVklnHDMzKwSTjhmZlYJ//DT\nbJDo7fbU08cNrSgSs3K4hWNmZpVwwjEzs0o44ZiZWSWccMzMrBJOOGZmVgmPUjPbRCxYtoaTuxnJ\n5lsb2KbALRwzM6uEE46ZmVXCCcfMzCrRsIQjabGkBZIelTQvlQ2XNFvSwvQ8LLf9WZIWSXpK0pG5\n8gPTfhZJulSSUvk2km5M5XMkja76GM3M7DWNbuG8NyIOiIjW9PpM4N6I2Bu4N71G0r7ABGA/YBzw\nQ0lbpjqXA6cCe6fHuFR+CrA6IvYCLgEurOB4zMysG4NtlNp4oC0tzwDaga+n8hsiYgPwtKRFwFhJ\ni4HtI+IhAEnXAEcDd6U656Z93QRcJkkREZUcSYV6m4Nr8piKArGG6e0z4FFsNhg0soUTwD2S5kua\nlMpaImJ5Wn4OaEnLo4BncnWXprJRabm2/HV1IqIDWAPsWO+DMDOzvmlkC+fdEbFM0puB2ZJ+l18Z\nESGp9NZISnaTAFpaWmhvb+/3vtavXz+g+v01eUxHj+tbhvS+TSM4rmIGEleZn8tGfe5747iKqSKu\nhiWciFiWnldIugUYCzwvaWRELJc0EliRNl8G7JarvmsqW5aWa8vzdZZK2grYAVjZRRxTgakAra2t\n0dbW1u9jam9vZyD1+6u7HwN2mjymg4sWDLbeU8dV1EDiWnxCW32DyWnU5743jquYKuJqSJeapKGS\ntutcBo4AHgNmARPTZhOBW9PyLGBCGnm2B9nggLmp+22tpEPS6LSTaup07usY4L5mvH5jZrapaNSf\ncS3ALWkE81bAjyPiZ5IeBmZKOgVYAhwHEBGPS5oJPAF0AKdHxMa0r9OA6cAQssECd6Xyq4Fr0wCD\nVWSj3MzMrEEaknAi4o/AO7ooXwkc3k2dKcCULsrnAft3Uf4ycOyAgzVrAh7FZoNBo3+HY2Zmmwkn\nHDMzq4QTjpmZVWLwjf00s8r1dI3H13esXtzCMTOzSjjhmJlZJdylZmY96m1I9fRxQyuKxDZ1buGY\nmVklnHDMzKwS7lIzswFZsGxNtxPIeoSb5TnhmFlpPKWO5blLzczMKuEWjpk1jFtAmxcnHDMbtDwD\nQnNxl5qZmVXCLRwz2yT5B6mbHrdwzMysEm7hmFlT8u+DBh+3cMzMrBJOOGZmVgl3qZnZZse//2kM\nt3DMzKwSbuFsAnr7a8zMbFPgFo6ZmVXCLRwzsxq+xlMOt3DMzKwSTjhmZlYJd6mZmRXkWaz7xy0c\nMzOrhBOOmZlVwl1qZmZ15NsmdM8JZ5DwjzvNrNk1fcKRNA74T2BL4KqIuKDBIZnZZqyn2yZAcw86\naOqEI2lL4AfAB4ClwMOSZkXEE42NzMysa808Aq6pEw4wFlgUEX8EkHQDMB5wwjGzTU6ZXe9VXFtS\nRJT+Jo0i6RhgXER8Nr0+ETg4Ir6Q22YSMCm9fBvw1ADecgTwwgDql8VxFeO4inFcxTRjXLtHxE69\nbdTsLZxeRcRUYGo99iVpXkS01mNf9eS4inFcxTiuYjbnuJr9dzjLgN1yr3dNZWZmVrFmTzgPA3tL\n2kPSG4AJwKwGx2Rmtllq6i61iOiQ9AXgbrJh0dMi4vES37IuXXMlcFzFOK5iHFcxm21cTT1owM
zM\nBo9m71IzM7NBwgnHzMwq4YRTB5LGSXpK0iJJZzY6nk6SFktaIOlRSfMaGMc0SSskPZYrGy5ptqSF\n6XnYIInrXEnL0jl7VNKHGhDXbpLul/SEpMclfTGVN/Sc9RBXQ8+ZpG0lzZX0Xymub6XyRp+v7uJq\n+GcsxbGlpN9Iuj29Lv18+RrOAKXpc35Pbvoc4PjBMH2OpMVAa0Q09Edmkv4JWA9cExH7p7JvA6si\n4oKUpIdFxNcHQVznAusj4rtVxlIT10hgZEQ8Imk7YD5wNHAyDTxnPcR1HA08Z5IEDI2I9ZK2Bh4E\nvgh8nMaer+7iGkeDP2Mpvi8DrcD2EfHhKv5PuoUzcH+bPici/gp0Tp9jSUT8HFhVUzwemJGWZ5B9\ncVWqm7gaLiKWR8QjaXkd8CQwigafsx7iaqjIrE8vt06PoPHnq7u4Gk7SrsBRwFW54tLPlxPOwI0C\nnsm9Xsog+E+YBHCPpPlpCp/BpCUilqfl54CWRgZT4wxJv01dbpV39eVJGg28E5jDIDpnNXFBg89Z\n6h56FFgBzI6IQXG+uokLGv8Z+x7wNeDVXFnp58sJp7m9OyIOAD4InJ66kAadyPp1B8VffsDlwJ7A\nAcBy4KJGBSLpjcDNwJciYm1+XSPPWRdxNfycRcTG9FnfFRgraf+a9Q05X93E1dDzJenDwIqImN/d\nNmWdLyecgRu00+dExLL0vAK4haz7b7B4Pl0T6Lw2sKLB8QAQEc+nL4lXgStp0DlLff43A9dFxE9T\nccPPWVdxDZZzlmL5M3A/2XWShp+vruIaBOfrMOCj6RrvDcD7JP2ICs6XE87ADcrpcyQNTRd2kTQU\nOAJ4rOdalZoFTEzLE4FbGxjL33T+h0s+RgPOWbrYfDXwZERcnFvV0HPWXVyNPmeSdpL0prQ8hGwA\nz+9o/PnqMq5Gn6+IOCsido2I0WTfV/dFxKeo4Hw19dQ2VWjA9Dl91QLckn1HsBXw44j4WSMCkXQ9\n0AaMkLQUOAe4AJgp6RRgCdlIp8EQV5ukA8i6ExYDn6s6LrK/QE8EFqT+f4Bv0Phz1l1cxzf4nI0E\nZqQRo1sAMyPidkm/prHnq7u4rh0En7GulP758rBoMzOrhLvUzMysEk44ZmZWCSccMzOrhBOOmZlV\nwgnHzMwq4YRjZmaVcMIxq4Ckttw08G2SDs2t+7ykk9LyyZJ2ya1rl9Taj/frVz2zMvmHn2bVayO7\nLcKvACLiity6k8l+ef5s5VGZlcwJx6wO0vRBM8nm0tsSOB9YQzYr70tk90LpnGX588BGSZ8CzgAO\nJ0tAi8nxNnn3AAABkElEQVTuT3KdpL8A/1jzHkcA3wK2Af4AfDo3/X1PsXVZL82lNQP4CNnU+cdG\nxO/6eQrMeuUuNbP6GAc8GxHvSDdz+xnZxIwfAQ4EdgaIiMXAFcAlEXFARPyicwcRcRMwDzghrftL\n5zpJI4CzgfdHxLvSdl/uLag+1HshlV8OfKW/B2/WF27hmNXHAuAiSRcCtwPrgKcjYiFAmo13IPck\nOgTYF/hlmh/vDcCv61Cvcybq+WR3yDQrjROOWR1ExO8lvQv4EPBvwL11fguR3cDr+DrX25CeN+Lv\nAyuZu9TM6iCNLHspIn4EfAc4FBgt6a1pk/wX/jpgu2521d26h4DDJO2V3m+opH36EFp/65nVnf+i\nMauPMcB3JL0KvAL8b2AEcIekl4Bf8FoiuQ24SdJ4skEDedOBK2oHDUTEnySdDFwvaZtUfDbw+56C\n6m89szL49gRmZlYJd6mZmVkl3KVmtgmTdAuwR03x1yPi7kbEY9YTd6mZmVkl3KVmZmaVcMIxM7NK\nOOGYmVklnHDMzKwS/x8Karkzt+2BugAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f121b035ed0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(wdtitle_len, bins = 40, range=(0, 40))\n",
    "plt.xlabel('sdtitle_len')\n",
    "plt.ylabel('question_number')\n",
    "plt.title('train question words num')\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEXCAYAAABYsbiOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHjxJREFUeJzt3XucHXWd5vHPQ0DABBMgTBMIkqDR2WA0kgYUL9MRL1HA\nMK5iFJzA4GTdQWdc8ZIos+I4WeMF1xEEJgKbQJA2RtmEmwxEGvGCSJAhchOUjhIggQAhDZlIwnf+\nqF/Loe3q7jrddU6l+3m/Xv06dep2nvqlc75dv7opIjAzM+vNLs0OYGZm1eUiYWZmuVwkzMwsl4uE\nmZnlcpEwM7NcLhJmZpbLRcIaStL5kv6p2TkGQ1KXpEOanaNeks6UtKzZOWzn4CJhAyapU9JbB7OO\niPhIRHxxqDKVTVKHpA/XjouIMRHxu2ZlMmskFwkbMpJ2bXYGe54y/j9ug+JfIBsQSZcALwWuSN0t\nn5Y0SVJIOlXS74EfpXm/J+kRSZsl/VjSoTXrWSLpX9Jwm6QHJZ0uaaOkhyWd0keGyZJulLRF0nWS\nzunuNuleV4/5/7TnI2kXSfMl/VbSJknLJe2Tpu0haVka/6SkX0pqkbQQeBNwTtrmc9L8IenlaXis\npIslPSppnaQzur+YJZ0s6SeSvibpCUkPSHpnzradIumKmvf3Sfpezfs/SJqeho9KGTen16Nq5uuQ\ntFDST4FngEN6thswvmb+Xrc9J2OnpE9KuiN99ncl7VG7rT3mr22nJZLOlXRNasufStpf0jdS29wj\n6bV5//bWPC4SNiAR8SHg98BxqbvlKzWT/wr4b8A70vtrgCnAXwC3AZf2ser9gbHAgcCpwLck7Z0z\n73eANWRfcl8E5hbYhI8Bx6esBwBPAN9K0+amDAcB+wIfAbZGxOeAm4CPpm3+aC/rPTste0ha998A\ntYXuSODelPkrwIWS1Mt6bgTelIrZAcCLgNcDpOMfY4A7UmG7Cvhmyvp14CpJ+9as60PAPGAvYB19\nt1uv295Lvm4nALOAycCrgZP7mLe3Zc9IObYBPyf7/RgPrEjbYhXjImFD4cyIeDoitgJExEURsSUi\ntgFnAq+RNDZn2WeBf46IZyPiaqALeGXPmSS9FDgc+KeI2BYRPwau6DlfHz4CfC4iHqzJ9d7URfYs\n2RfkyyNiR0SsiYin+luhpFHAHGBB2t5O4CyyL+lu6yLi2xGxA1gKTAD+7C/1dIxjCzAdeDNwLfCQ\npL8kKz43RcRzwDHAfRFxSURsj4jLgHuA42pWtyQi7oyI7enz+mq3otv+zYh4KCIeT+uZ3l871bg8\nrf8/gcuB/4yIi1PbfBfwnkQFuUjYUPhD94CkUZIWpW6dp4DONGl8r0vCpvRl1u0Zsr+aezoAeCIi\nnq4Zt65AxoOBy1OXypPA3cAOsi/sS8i+lNslPSTpK5J2G8A6xwO79cixjmyvqNsj3QMR8Uwa7G37\nINubaCMrEjcCHWQF4q/Se8jaoed29/zMP9QM99duRbf9kZrhvH+rPBtqhrf28r7IuqxBXCSsiLxb\nBteO/yAwG3grWTfGpDS+ty6WIh4G9pY0umbcS2uGnwZe3P0m/ZW/X830PwDvjIhxNT97RMT6tBfz\nhYiYChwFHEvWbdRz23p6jOwv8YN7ZFpfdOOS7iLxpjR8I39eJB7q8Xm9fWZt5j7brZ9tL6Jn++9f\nxzqsglwkrIgNZH3vfdmLrL95E9mXxv8Zig+OiHXArcAXJL1I0ht5YRfLb4A9JB2T/hI+A9i9Zvr5\nwEJJBwNI2k/S7DQ8U9K0VFieIvvify4tl7vNqZtkeVrvXmndnwDqvQbhRmAmsGdEPEh2PGQWWXfQ\nr9I8VwOvkPRBSbtKej8wFbgyJ2Of7dbPthfxH8Chkqang9ln1rEOqyAXCSviS8AZqcvmkznzXEzW\nnbEeuAu4eQg//4NkB4IfBz6fPguAiN
gM/D1wQfrsp4Has53+FVgF/LukLSnXkWna/mQHTp8i64a6\nkawbpnu596YzcL7ZS6aPpc/6HfATsoPEF9WzcRHxG7JjMjel90+l9f40FSQiYhPZX/unkxXiTwPH\nRsRjfaw6t93oe9uLZv9n4HrgPrK2sGFAfuiQ7awknUl2wPWkZmcxG668J2FmZrlcJMzMLJe7m8zM\nLJf3JMzMLNdOf0O28ePHx6RJk+pa9umnn2b06NH9z9hgVc0F1c3mXMU4VzHDMdeaNWsei4j9+p0x\nInbqnxkzZkS9brjhhrqXLVNVc0VUN5tzFeNcxQzHXMCtMYDvWHc3mZlZLhcJMzPL5SJhZma5XCTM\nzCyXi4SZmeVykTAzs1wuEmZmlstFwszMcrlImJlZrp3+thyDsXb9Zk6ef1Xu9M5FxzQwjZlZ9XhP\nwszMcrlImJlZLhcJMzPLVXqRkNQpaa2k2yXdmsbtI+k6Sfel171r5l8g6X5J90p6R9n5zMwsX6P2\nJGZGxPSIaE3v5wOrI2IKsDq9R9JUYA5wKDALOFfSqAZlNDOzHprV3TQbWJqGlwLH14xvj4htEfEA\ncD9wRBPymZkZDXjGtaQHgM3ADuDfImKxpCcjYlyaLuCJiBgn6Rzg5ohYlqZdCFwTESt6rHMeMA+g\npaVlRnt7e13ZNj6+mQ1b86dPO3BsXesdrK6uLsaMGdOUz+5PVbM5VzHOVcxwzDVz5sw1Nb07uRpx\nncQbI2K9pL8ArpN0T+3EiAhJhSpVRCwGFgO0trZGW1tbXcHOvnQlZ63Nb4LOE+tb72B1dHRQ7zaV\nrarZnKsY5ypmJOcqvbspItan143A5WTdRxskTQBIrxvT7OuBg2oWn5jGmZlZE5RaJCSNlrRX9zDw\nduDXwCpgbpptLrAyDa8C5kjaXdJkYApwS5kZzcwsX9ndTS3A5dlhB3YFvhMRP5T0S2C5pFOBdcAJ\nABFxp6TlwF3AduC0iNhRckYzM8tRapGIiN8Br+ll/Cbg6JxlFgILy8xlZmYD4yuuzcwsl4uEmZnl\ncpEwM7NcLhJmZpbLRcLMzHK5SJiZWS4XCTMzy+UiYWZmuVwkzMwsl4uEmZnlcpEwM7NcLhJmZpbL\nRcLMzHK5SJiZWa5GPL50pzVp/lV9Tu9cdEyDkpiZNYf3JMzMLJeLhJmZ5XKRMDOzXC4SZmaWy0XC\nzMxyuUiYmVkuFwkzM8vlImFmZrlcJMzMLJeLhJmZ5XKRMDOzXC4SZmaWy0XCzMxyuUiYmVkuFwkz\nM8vlImFmZrkaUiQkjZL0K0lXpvf7SLpO0n3pde+aeRdIul/SvZLe0Yh8ZmbWu0btSfwjcHfN+/nA\n6oiYAqxO75E0FZgDHArMAs6VNKpBGc3MrIfSi4SkicAxwAU1o2cDS9PwUuD4mvHtEbEtIh4A7geO\nKDujmZn1ThFR7gdIK4AvAXsBn4yIYyU9GRHj0nQBT0TEOEnnADdHxLI07ULgmohY0WOd84B5AC0t\nLTPa29vryrbx8c1s2FrvlsG0A8fWv3Afurq6GDNmTCnrHqyqZnOuYpyrmOGYa+bMmWsiorW/+Xat\na+0DJOlYYGNErJHU1ts8ERGSClWqiFgMLAZobW2NtrZeV92vsy9dyVlr62+CzhPr+9z+dHR0UO82\nla2q2ZyrGOcqZiTnKrVIAG8A3i3pXcAewEskLQM2SJoQEQ9LmgBsTPOvBw6qWX5iGmdmZk1Q6jGJ\niFgQERMjYhLZAekfRcRJwCpgbpptLrAyDa8C5kjaXdJkYApwS5kZzcwsX9l7EnkWAcslnQqsA04A\niIg7JS0H7gK2A6dFxI4mZTQzG/EaViQiogPoSMObgKNz5lsILGxULjMzy+crrs3MLJeLhJmZ5XKR\nMDOzXAMqEsoc1P+cZmY2nAyoSER2WfbVJWcxM7OKKdLddJukw0tLYmZmlVPkFNgjgRMlrQOeBkS2\nk/
HqUpKZmVnTFSkSfraDmdkIM+DupohYR3Zfpbek4WeKLG9mZjufAX/JS/o88BlgQRq1G7CsjFBm\nZlYNRfYE/hp4N9nxCCLiIbJnRJiZ2TBVpEj8MZ0KGwCSRpcTyczMqqJIkVgu6d+AcZL+Drge+HY5\nsczMrAoGfHZTRHxN0tuAp4BXAP87Iq4rLZmZmTVd0VuFrwX2JOtyWjv0cczMrEqKnN30YbKnxL0H\neC9ws6S/LSuYmZk1X5E9iU8Br00PDELSvsDPgIvKCGZmZs1X5MD1JmBLzfstaZyZmQ1T/e5JSPpE\nGrwf+IWklWTHJGYDd5SYzczMmmwg3U3dF8z9Nv10Wzn0cczMrEr6LRIR8YVGBNkZTZp/Ve60zkXH\nNDCJmVk5BnzgWlIr8Dng4NrlfKtwM7Phq8jZTZeSneG0FniunDhmZlYlRYrEoxGxqrQkZmZWOUWK\nxOclXQCsBrZ1j4yIHwx5qhGgr+MZS2b53olmVg1FisQpwF+SPUeiu7spABcJM7NhqkiRODwiXlla\nEjMzq5wiV1z/TNLU0pKYmVnlFNmTeB1wu6QHyI5JCAifAmtmNnwVKRKzSkthZmaVVKRIRGkpzMys\nkooUiavICoWAPYDJwL3AoXkLSNoD+DGwe/qsFRHxeUn7AN8FJgGdwAkR8URaZgFwKrAD+IeIuLbY\nJpmZ2VAZ8IHriJgWEa9Or1OAI4Cf97PYNuAtEfEaYDowS9LrgPnA6rSe1ek96cD4HLLCMws4V9Ko\nohtlZmZDo8jZTS8QEbcBR/YzT0REV3q7W/rpvs340jR+KXB8Gp4NtEfEtoh4gOz25EfUm9HMzAZH\nEQM71FDzXAnIisthwL4R8Y5+lhsFrAFeDnwrIj4j6cmIGJemC3giIsZJOge4OSKWpWkXAtdExIoe\n65wHzANoaWmZ0d7ePqBt6Gnj45vZsLWuRfs17cCxfU5fu35z7rTJY0cxZsyYoY40JLq6uiqZzbmK\nca5ihmOumTNnromI1v7mK3JMYq+a4e1kxyi+399CEbEDmC5pHHC5pFf1mB6SCh0Uj4jFwGKA1tbW\naGtrK7L4n5x96UrOWlukCQau88S2Pqef3M9tOerdprJ1dHRUMptzFeNcxYzkXAP+hhzscyUi4klJ\nN5Ada9ggaUJEPCxpArAxzbYeOKhmsYlpnJmZNUGR50m8Avgk2RlJtc+TeEsfy+wHPJsKxJ7A24Av\nA6uAucCi9Nr9lLtVwHckfR04AJgC3FJgeyqjrxv4mZntLIr0tXwPOB+4gOz01IGYACxNxyV2AZZH\nxJWSfg4sl3QqsA44ASAi7pS0HLiLrEvrtNRdZWZmTVCkSGyPiPOKrDwi7gBe28v4TcDROcssBBYW\n+RwzMytHkVNgr5D095ImSNqn+6e0ZGZm1nRF9iTmptdP1YwL4JChi2NmZlVS5OymyX1Nl/S2iLhu\n8JHMzKwq6r7iuhdfHsJ1mZlZBQxlkdAQrsvMzCpgKIuEbyVuZjbMDGWRMDOzYWYoi0TnEK7LzMwq\noNDd7SQdxZ/fluPi9PqeIU1mZmZNV+TeTZcALwNu5/nbcgRwcQm5zMysAorsSbQCU2OgD6AwM7Od\nXpFjEr8G9i8riJmZVU+RPYnxwF2SbiF7djUAEfHuIU9lZmaVUKRInFlWCDMzq6Yi9266UVILcHga\ndUtEbOxrGTMz27kN+JiEpBPInhL3PrKHBP1C0nvLCmZmZs1XpLvpc8Dh3XsP6dGk1wMryghmZmbN\nV+Tspl16dC9tKri8mZntZIrsSfxQ0rXAZen9+4Grhz6SmZlVRZED15+S9N+BN6RRiyPi8nJimZlZ\nFRS6d1NEfB/4fklZzMysYvotEpJ+EhFvlLSFFz4zQkBExEtKS2dmZk3Vb5GIiDem173Kj2NmZlVS\n5DqJSwYyzszMho8ip7AeWvtG0q7AjKGNY2ZmVdJvkZC0IB2PeLWk
p9LPFmADsLL0hGZm1jT9FomI\n+FI6HvHViHhJ+tkrIvaNiAUNyGhmZk1SpLvpSkmjASSdJOnrkg4uKZeZmVVAkSJxHvCMpNcApwO/\nxY8uNTMb1opcTLc9IkLSbOCciLhQ0qllBbP6TZp/Ve60zkXHNDCJme3sihSJLZIWAB8C3iRpF2C3\ncmKZmVkVFOluej/ZY0v/NiIeASYCX+1rAUkHSbpB0l2S7pT0j2n8PpKuk3Rfet27ZpkFku6XdK+k\nd9SxTWZmNkQGXCRSYfg+sHsa9RjQ3w3+tgOnR8RU4HXAaZKmAvOB1RExBVid3pOmzSG7JmMWcK6k\nUQPfHDMzG0oD7m6S9HfAPGAf4GXAgcD5wNF5y0TEw8DDaXiLpLvTcrOBtjTbUqAD+Ewa3x4R24AH\nJN0PHAH8vMhG7ezWrt/MyT6uYGYVoIjofy5A0u1kX9i/iIjXpnFrI2LaAJefBPwYeBXw+4gYl8YL\neCIixkk6B7g5IpalaRcC10TEih7rmkdWsGhpaZnR3t4+oG3oaePjm9mwta5FS9WyJ6Xlmnbg2EEt\n39XVxZgxY4YozdBxrmKcq5jhmGvmzJlrIqK1v/mKHLjeFhF/zL7T/3RbjgFVGEljyLqqPh4RT3Wv\nA7LbyEoaWKV6fpnFwGKA1tbWaGtrK7L4n5x96UrOWlvobukNcfq07aXl6jyxbVDLd3R0UG97l8m5\ninGuYkZyriIHrm+U9FlgT0lvA74HXNHfQpJ2IysQl0bED9LoDZImpOkTgO7Hoq4HDqpZfGIaZ2Zm\nTVCkSMwHHgXWAv+D7NGlZ/S1QOpKuhC4OyK+XjNpFTA3Dc/l+XtArQLmSNpd0mRgCnBLgYxmZjaE\nijy+9Dng2+lnoN5Adl3F2nRMA+CzwCJgeboYbx1wQvqMOyUtB+4iOzPqtIjYUeDzzMxsCBU5u+kB\nejkGERGH5C0TET8he4Jdb3o9KyoiFgILB5rLzMzKU+ToaO1R8D2A95GdDms7kb5u2QE+vdbMXqjI\nxXSban7WR8Q3AH+jmJkNY0W6mw6rebsL2Z5F9c4fNTOzIVPkS/4snj8msR3oJOtyMjOzYapIkbiS\nrEh0H4gO4NjuC+N6nOJqZmbDQJEiMQM4nOyaBgHHkV3DcF8JuaxJ+juwvWTW6AYlMbMqKFIkJgKH\nRcQWAElnAldFxEllBDMzs+YrcsV1C/DHmvd/TOPMzGyYKrIncTFwi6TuZ0gcDywZ8kRmZlYZRW7L\nsVDSNcCb0qhTIuJX5cQyM7MqKHSdQ0TcBtxWUhYzM6uYIsckzMxshHGRMDOzXC4SZmaWy0XCzMxy\nuUiYmVkuFwkzM8vlW31bIWvXb+bknPs7+YFFZsOP9yTMzCyXi4SZmeVykTAzs1wuEmZmlstFwszM\ncrlImJlZLhcJMzPL5SJhZma5XCTMzCyXi4SZmeXybTlsyEzKuV1HN9+2w2zn4yJhleACY1ZNLhLW\nMP0VAjOrnlKLhKSLgGOBjRHxqjRuH+C7wCSgEzghIp5I0xYApwI7gH+IiGvLzGc7j/4KzJJZoxuU\nxGxkKfvA9RJgVo9x84HVETEFWJ3eI2kqMAc4NC1zrqRRJeczM7M+lFokIuLHwOM9Rs8GlqbhpcDx\nNePbI2JbRDwA3A8cUWY+MzPrmyKi3A+QJgFX1nQ3PRkR49KwgCciYpykc4CbI2JZmnYhcE1ErOhl\nnfOAeQAtLS0z2tvb68q28fHNbNha16KlatmTSuaC6mabPHYUY8aMaXaMP9PV1eVcBThXMYPJNXPm\nzDUR0drffE09cB0RIalwlYqIxcBigNbW1mhra6vr88++dCVnra3esfvTp22vZC6obrYls0ZT7+9B\nmTo6OpyrAOcqphG5mnEx3QZJEwDS68Y0fj1wUM18E9M4MzNrkmYUiVXA3DQ8F1hZM36OpN0lTQam\nALc0IZ+ZmSVlnwJ7GdAGjJf0
IPB5YBGwXNKpwDrgBICIuFPScuAuYDtwWkTsKDOfmZn1rdQiEREf\nyJl0dM78C4GF5SUyM7MifIM/MzPL5SJhZma5qncuo1kd1q7fzMk5t+7o7+aAfd3ywzcWtJHORcKG\nvcHcWNB3p7WRzt1NZmaWy0XCzMxyuUiYmVkuFwkzM8vlImFmZrlcJMzMLJeLhJmZ5XKRMDOzXL6Y\nzmwQ+rvYbsms0XUv6wv1rAq8J2FmZrlcJMzMLJeLhJmZ5fIxCbMS9XV3WrOdgfckzMwsl4uEmZnl\ncneTWUX5YUhWBd6TMDOzXC4SZmaWy91NZjuhwTySFfq+EtyslvckzMwsl4uEmZnlcpEwM7NcPiZh\nNgL1dSW4T6+1Wt6TMDOzXN6TMLMh5YsAhxcXCTN7gTIfhjSYhzRZc7hImFkhg71Gw3YulSsSkmYB\n/wqMAi6IiEVNjmRmDTKYW6u7K6sclSoSkkYB3wLeBjwI/FLSqoi4q7nJzGwka1bxqkL3XKWKBHAE\ncH9E/A5AUjswG3CRMLM+ldkNdvq0+pct8xhPIygimp3hTyS9F5gVER9O7z8EHBkRH+0x3zxgXnr7\nSuDeOj9yPPBYncuWqaq5oLrZnKsY5ypmOOY6OCL262+mqu1JDEhELAYWD3Y9km6NiNYhiDSkqpoL\nqpvNuYpxrmJGcq6qXUy3Hjio5v3ENM7MzJqgakXil8AUSZMlvQiYA6xqciYzsxGrUt1NEbFd0keB\na8lOgb0oIu4s8SMH3WVVkqrmgupmc65inKuYEZurUgeuzcysWqrW3WRmZhXiImFmZrlGXJGQNErS\nryRdmd7vI+k6Sfel170rkutMSesl3Z5+3tWETJ2S1qbPvzWNa3p75eSqQnuNk7RC0j2S7pb0+oq0\nV2+5mtpekl5Z89m3S3pK0seb3V595KrC79f/knSnpF9LukzSHo1orxF3TELSJ4BW4CURcaykrwCP\nR8QiSfOBvSPiMxXIdSbQFRFfa3SWmkydQGtEPFYzruntlZPrTJrfXkuBmyLignR23ouBz9L89uot\n18dpcnt1S7fjWQ8cCZxGBf4/9pLrFJrYXpIOBH4CTI2IrZKWA1cDUym5vUbUnoSkicAxwAU1o2cD\nS9PwUuD4iuSqqqa3VxVJGgu8GbgQICL+GBFP0uT26iNXlRwN/DYi1lGt36/aXFWwK7CnpF3JCv1D\nNKC9RlSRAL4BfBp4rmZcS0Q8nIYfAVoanqr3XAAfk3SHpIua1A0WwPWS1ii7FQpUo716ywXNba/J\nwKPA/0vdhhdIGk3z2ysvFzT/96vbHOCyNNzs9qpVmwua2F4RsR74GvB74GFgc0T8Ow1orxFTJCQd\nC2yMiDV580TW99bQ/rc+cp0HHAJMJ/ulOKuRuZI3RsR04J3AaZLeXDuxGe3VR65mt9euwGHAeRHx\nWuBpYH7tDE1qr7xczW4vAFL317uB7/Wc1sTfr95yNbW9UlGaTVb0DwBGSzqpdp6y2mvEFAngDcC7\nU392O/AWScuADZImAKTXjVXIFREbImJHRDwHfJvsDrkNlf56ISI2ApenDM1ur15zVaC9HgQejIhf\npPcryL6cm91eveaqQHt1eydwW0RsSO+b3V695qpAe70VeCAiHo2IZ4EfAEfRgPYaMUUiIhZExMSI\nmES2G/mjiDiJ7LYfc9Nsc4GVVcjV/Q+f/DXw60bmkjRa0l7dw8DbU4amtlderma3V0Q8AvxB0ivT\nqKPJbnHf7N+vXnM1u71qfIAXduk0tb1qvCBXBdrr98DrJL1Yksj+He+mEe0VESPuB2gDrkzD+wKr\ngfuA64F9KpLrEmAtcEf6RZjQ4CyHAP+Rfu4EPleF9uojV1PbK2WYDtyaMvx/YO9mt1cfuarQXqOB\nTcDYmnFVaK/eclWhvb4A3ENWoC4Bdm9Ee424U2DNzGzgRkx3k5mZFeciYWZmuVwkzMwsl4uEmZ
nl\ncpEwM7NcLhJmZpbLRcKsF5La9Pxt29skHVUz7SOS/iYNnyzpgJppHZJa6/i8upYzK1ulnnFtVlFt\nQBfwM4CIOL9m2slkFzc91PBUZg3gImEjUrqlx3JgIjAK+CKwmeyOvM+Q3bsfSZOAjwA70g3VPkZ2\nS4QuoJPsGSCXStoKvL7HZ7yd7CrZ3YHfAqdERNcAsvW6XLq/11LgOGA34H0RcU+dTWA2IO5uspFq\nFvBQRLwmIl4F/JDsxm3HATOA/QEiohM4H/i/ETE9Im7qXkFErCC73cWJadrW7mmSxgNnAG+NiMPS\nfJ/oL9QAlnssjT8P+GS9G282UN6TsJFqLXCWpC8DVwJbyO6yeR9AukPwvD6W78/ryJ4a9tPsfmy8\nCPj5ECz3g/S6BnjPIPKZDYiLhI1IEfEbSYcB7wL+hewmaUNJwHUR8YEhXm5bet2B//9aA7i7yUak\ndEbSMxGxDPgq2b35J0l6WZql9kt6C7BXzqrypt0MvEHSy9PnjZb0igFEq3c5s1L4LxEbqaYBX5X0\nHPAs8D+B8cBVkp4BbuL5L/8rgBWSZpMduK61BDi/54HriHhU0snAZZJ2T6PPAH7TV6h6lzMri28V\nbmZmudzdZGZmudzdZNZAki4ne5h9rc9ExLXNyGPWH3c3mZlZLnc3mZlZLhcJMzPL5SJhZma5XCTM\nzCzXfwFljK+kvmbEYgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fb064184dd0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of wdtitle_len restricted to the 40-80 range; 40 bins over that span gives one bin per unit of length.\n",
    "plt.hist(wdtitle_len, bins=40, range=(40, 80))\n",
    "plt.xlabel('wdtitle_len')  # label matches the plotted variable\n",
    "plt.ylabel('question_number')\n",
    "plt.title('train question words num')\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练话题描述长度分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk import FreqDist  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "非空描述数量 2165150 \n"
     ]
    }
   ],
   "source": [
    "# Boolean mask of rows that have a description: a float in word_content is treated as missing\n",
    "# (presumably NaN from loading the file -- TODO confirm against the loading cell).\n",
    "index_content = df_train.word_content.apply(lambda wc: type(wc) is not float)\n",
    "print '非空描述数量 %d ' % index_content.sum()  # vectorized count of the True entries\n",
    "df_train_content = df_train.loc[index_content, :].copy()  # keep only the rows that have a description\n",
    "# Split the comma-separated word / char strings into lists, then store each list's length.\n",
    "df_train_content['word_content'] = df_train_content.word_content.apply(lambda text: text.split(','))\n",
    "df_train_content['ch_content'] = df_train_content.ch_content.apply(lambda text: text.split(','))\n",
    "df_train_content['wdcontent_len'] = df_train_content.word_content.apply(len)\n",
    "df_train_content['chcontent_len'] = df_train_content.ch_content.apply(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<FreqDist with 1990 samples and 2165150 outcomes>\n",
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaIAAAETCAYAAAB0nQK/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXmYVdWV6H+r5ioooACBYhIQHBAVvaU4BKPRAIkmmrRG\nTTqibTR5Jp3kmdfP2J3EtENaX5K2O5NpEoiaCY0ZFCMxOMaoKBSgDIoU8zzVPNe9d70/9j7Uqcut\nwZKqe6tq/b7vfvecddbee51zz93r7L3X2VtUFcMwDMNIFRmpNsAwDMMY2JgjMgzDMFKKOSLDMAwj\npZgjMgzDMFKKOSLDMAwjpZgjMgzDMFKKOSLDMAwjpZgjMgzDMFKKOSLDMAwjpWSl2oC+wMiRI3XS\npEndStvQ0HBkOz8/v81+ezLT7f+66WqX6ZpuMt3uUlpaekhVj+tMzxxRF5g0aRIrV67sVtrS0tIj\n25FIpM1+ezLT7f+66WqX6ZpuMt3uIiLbu6JnXXOGYRhGSjFHZBiGYaQUc0SGYRhGSjFHZBiGYaQU\nc0SGYRhGSjFHZBiGYaQUc0SGYRhGSrH3iAzDMAYAcVXqWpSapjiyo4LSvY3UNCm1zXFqW+Is2b2e\nrbsrqW1W6lvi1DYrdc1x7szczRUzx/WobT3miERkAvAIMBpQYIGq/reIDAceBSYB24BPqWqFT3MH\ncBMQA76sqs94eQR4CMgHnga+oqoqIrm+jAhwGLhGVbf5NPOBb3hz7lHVh718MrAYGAGUAp9V1eae\nug6GYRjHmpaYUtMcd5+mODXNysboDtZvqqU6JNPXX2FfeQ01zXHqmpX4kRwOJcm1LmlZh2t7vnrs\nyRZRFPiaqq4SkUKgVESWATcAz6nqfSLydeDrwO0iMh24FjgVGAs8KyInqmoMeBC4GXgd54jmAUtx\nTqtCVaeKyLXA/cA13tndCZTgnGCpiDzpHd79wAOqulhEfurzeLAHr4NhGEanqCo1zcqBuhiHG2JU\nNcV5pXITG7ZUU9UUo7IxTlVjnJolz1DTGE2SQ2USWVObvYJsoTAngzFFg8mINlCYk8HgnAwG5Qgn\nTzme8v27GJydwaCcDAZlC4NzMvjgrIk9cr5heswRqepeYK/frhGRt4FxwBXARV7tYeBF4HYvX6yq\nTcBWESkDzhGRbcAQVV0OICKPAFfiHNEVwLd9Xo8DPxIRAeYCy1S13KdZBswTkcXAh4BPh8r/NuaI\nDMPoYVSVyqY4q3ZU8MrOBg7UxThYH+NAXYzal15ix+E6mmKakKo6aV6ZgnMiuRkU5jjnMnnsKJpq\nyinMES/PoOS0U9izbRNDcoRBORlkZQjQ3hQ/kyktLT+qrLzszGNy/h3RK2NEIjIJOBPXohntnRTA\nPlzXHTgntTyUbJeXtfjtRHmQZieAqkZFpArX5XZEnpBmBFCpqtEkeRmGYbwv6lvi7KuNsb8uxhs1\nmyndWMXB+hgHvdNpiQMcTJLSdX8VZAujCjIZUZDJsLwMTp40loaKAwzNzWBoXgbD8jL54DlnUvb2\nW2SItMkhEjn9aOcyeTil5ekfCiCqiR74GBcgMhh4CbhXVf8gIpWqOix0vEJVi0TkR8ByVf2Vly/E\ntXq2Afep6qVePhu4XVUvF5F1wDxV3eWPbQZm4br/8lT1Hi//JtCAG2darqpTvXwCsFRVZySx+xbg\nFoDi4uLIkiVLunX+9fX1R7YLCgra7LcnM93+r5uudplux/L8/Hz2VtSxvz7Ogfo4B+ri7K+Pc6hR\n2Fcbpaa54/p0cLYwalAmw/NgZH4GI/OFkfkZTBhewCBpZlB2W+eSLtehu5SUlJSqaklnej3qKkUk\nG/g98GtV/YMX7xeRYlXdKyLFwAEv3w1MCCUf72W7/XaiPJxml4hkAUNxQQu7ae3+C9K86I8NE5Es\n3yoK59UGVV0ALAAoKSnR7s5Aa7Nvm2
6qyzLd96YbU+VQfQwtnMjLZe+wry7qWjm1MQ421NHQEjsq\n34CcDBg1OIsxgzI5fcpYpP4QowdlclxBJscNyiQ/K6PPXIewvKfpyag5ARYCb6vqf4YOPQnMB+7z\n30+E5L8Rkf/EBStMA95Q1ZiIVIvIubiuveuBHybk9RpwFfC8j6Z7BviOiBR5vTnAHf7YC153cUL5\nhmEMEFSVysY4e2pjbHpjB6+tr2FPTZS9tTH21UaJxiF5ZBkMzhHGDMpi9OBMxgzOZMygTC48azpV\nezZTlJdxpMssEpmetGI3jqYnW0QXAJ8F1orIGi/7V5wDekxEbgK2A58CUNX1IvIYsAEXcfdFHzEH\ncCut4dtL/Qeco/ulD2wox0XdoarlInI3sMLr3RUELuACIxaLyD3Aap+HYRj9kJrmOHtrouypjbG3\nJsovNq5i/Y5D7K2N0RgNutGOHqAfnpfBtOJhDKaBMYMyvdPJYu4FEco2vHWUfmTKCEortvXsyfRj\nejJq7u+AtHP4knbS3Avcm0S+EjhqHEdVG4Gr28lrEbAoiXwLcE67hhuG0aeIxuLsromyqzrK7uoo\nu2uiVL/+Cpv2VSUZs2l9V2ZwjjB2cBYzJo0it6mK4sJMxhZmMWZw+11oQ/Oze+GMBh7pH05hGIYB\n1DdH2VLRwq7qKLtCjmf/H/5Cy1Fhz40A5GUKxYWZFA/OYmxhJufNmErz4Z0UD86iMNfNcBaJzLQu\ntBRjjsgwjLSiIeretVm2pd45G+90Dv7umXbTHFeQwfghWYwrzGLckCwuOms6tXvdmI2EwpwjkfGU\nlu7vjdMw3gPmiAzDSAmxuLJpfw2v7Gxge1WUHVVRtldFOVAXozWYtpXsTGH0oEzGF2YecTrjh2Rx\n2YUlvL32zTa6kRNGUFq5rXdOxHjfmCMyDKPHqW+J8/qWw/xlUx1bK1rYVum616Lxo1snWQJTRxcy\nKqeFCUNcC2f8kCw+Mvts3lqz+ij9ghyrxvo69gsahnFMOVTbxOp9TWytbGFLRZStlS3sq03eyhlf\nlM+YvDgTh2Zx/NAsJg7NYmxhFrPOLjlq3CY701at6a+YIzIMo1uoKrsrG3h9d2Mbp1P+u2eP0s3K\ngFOKhzI6p5nJw7KZNMw5ndnnnm2BAoY5IsMwukZFY4yy8pYjn80VLdQ0H921Nignk4lDMpg8LJvJ\nw7KYUpTNuCFZnJuklWMYYI7IMIwk1LfE2VzR6nR2/PU59lQ1HqVXVJDNxEJp43Quu/AcVq9elQKr\njb6KOSLDGODE4sq2qijvHm6mrLyFXS+9xOYDtSS+mZOfJZxQlM3U4a2fuR84m1Wr2jqdjIz23mM3\njOSYIzKMAUZdS5x3D7fw0l838sK6cjYdbqEx4YXQLIFJw1odzhUfOIOKnRvJTFh6QMScjvH+MUdk\nGP0YVeVgfYw/rd7N0lVVvHOohR1VUd/aqTiiN2ZwJieNyGba8Gw+dv7pNOwtIzuz1clMG11I6S5z\nOkbPYI7IMPoZB+qirDvYzPoDzaw72Myh+jjhmaSzBKYUZfPBU8dTFKvgpBHZDMtrXYXzjAnDKD1g\nTsfoPcwRGUYf52B9jHUHmll/sJlNzz7ProqGNscHZwuzTjiOMdn1nDwihxOGZ5ObKbZMgZE2mCMy\njD5GQ0ucdQeb+ePOtSxbe5D9dW0XahuULUw/LocZx+UwY1QOE4dmcXaJhU4b6Ys5IsNIc1SVrZUt\nrNnXxOp9zWw81IxbSqcSgIJsYfpI53SuuvAM6vduOiqowDDSGXNEhpGG1LfEWbOvmdK9jaz/y3Mc\nrGk6ciwDOGlENh85cxKj44c5YXj2EcczY9xQSveZEzL6Fj25VPgi4HLggKrO8LJHgZO8yjCgUlVn\nisgk4G1goz+2XFW/4NNEaF2d9WngK37J71zgESACHAauUdVtPs184Bs+r3tU9WEvn4xbInwEUAp8\nVl
Wbe+L8DeO9crA+xiOvbeP3y8tZfyBo9TiG52Uwc0wuM8fkcProXApzMohETrTuNqNf0JMtooeA\nH+GcBQCqek2wLSLfB6pC+ptVdWaSfB4EbgZexzmiebilwm8CKlR1qohcC9wPXCMiw4E7gRJAgVIR\neVJVK7zOA6q6WER+6vN48Bidr2G8J1Tdi6Rv7G5kxZ4mtlZGgYOAa/WcMjKbkrF5/OOHzqRm10Z7\nZ8fot/TkUuF/8y2doxD3j/oU8KGO8hCRYmCIqi73+48AV+Ic0RXAt73q48CPfL5zgWWqWu7TLAPm\nichiX96nfZqHfXpzREavsqPKvUz6uzcOsbe2NdAgL1O46OTRTC1oIFKcyxC/guhJYwop3W1OyOi/\npGqMaDawX1U3hWSTRWQNrpX0DVV9GRgH7Arp7PIy/PdOAFWNikgVrsvtiDwhzQhcV2A0SV6G0aPs\nqYny6s5G/r6zkZ3VUVxvMgzJzeDccbmcPTaPGaNyOO+ciHW3GQMOUU2cUeoYZu5aRE8FY0Qh+YNA\nmap+3+/nAoNV9bAfE/oTcCpwInCfql7q9WYDt6vq5SKyDpinqrv8sc3ALOAGIE9V7/HybwINuK7C\n5ao61csnAEsTbQvZeAtwC0BxcXFkyZIl3boG9fX1R7YLCgra7LcnM93+oVvdFOe1PS28uifGlsro\nEZ1B2XDe+Hwio4RThmeSGZqbLd3OwXRNt6Cg4ChZVykpKSlV1ZLO9Hq9RSQiWcAncUEGAKhqE9Dk\nt0u9UzkR2A2MDyUf72X47wnALp/nUNxj5m7gooQ0L/pjw0Qky7eKwnkdhaouABYAlJSUaCQSaU+1\nQ8JPt5HI0U+7yWSm23d1T595Jguffo3ntzZQureJYAq3/Cxh1rhczp+Qz+mjc9pdEiEdzsF0TTdR\n3tOkomvuUuCdoCUDICLHAeWqGhORKcA0YIuqlotItYiciwtWuB74oU/2JDAfeA24CnjeR9M9A3xH\nRIq83hzgDn/sBa+72Kd9osfP1hgQHKiLsmxLA59f+hyHal0gZgYQKc7lny4+leGNu8nJtHEew0hG\nT4Zv/xbXMhkpIruAO1V1IXAt8NsE9QuBu0SkBYgDXwiCDYBbaQ3fXuo/AAuBX4pIGVDu88U7r7uB\nFV7vrlBetwOLReQeYLXPwzC6RUyVNfuaeGZzA6v2Nh1ZNmF8YSYXTy7ggxPzKMrPJHJ6MaWle1Jq\nq2GkMz0ZNXddO/Ibksh+D/y+Hf2VwFHjOKraCFzdTppFwKIk8i3AOR3ZbRidUdUY48cvlPHwy4c4\nUO+i3rIy4PzxeXz5o2fCoS0Wam0Y7wGbWcEwusj2qhae3FjH33c0ElX3vs+oQZnMnZLPxZMLGJqb\nQWTScEoPb02xpYbRtzBHZBgdoKr8fdMhvve3ctbsbx37ufSUUcwa0cLMMTlkWOvHMN4X5ogMIwnR\nuPLqzkaefLeOrZX7AcjNFC6ZnM9l0wq47INn2/s+hnGMMEdkGCGaYsqyLfUs2VjHoYY4ACMH5/Lh\nSdnMPaGAwpyMFFtoGP0Pc0SGAbTElIdf3cZ//fUgFY3OAY0rzOTjJw3iK1ecz/q31qTYQsPov5gj\nMgY0LXHlha0NPP52LYcbXBfc5GFZfGr6YErG5pIhQl52Zie5GIbxfjBHZAxIYnHl+a31PLahloP1\nrgV08phCPj4lk3PG5lr4tWH0IuaIjAHHm/ub+LcfvMw7+2oA9wLqNacW8sWPn8/q1atSbJ1hDDzM\nERkDhl3VUR55q4bSvW6105EFGXx6RiEfmJhHpggZGdYKMoxUYI7I6PdUN8V5dH0tf91ST1zdBKT/\nfOmJzCyoJNfmfzOMlGOOyOi3xFT55fLt3Lf0IHUtSgYwZ0o+15w6mEsumGrvARlGmmCOyOiXbKlo\nYcGqajaVu0i4M0bncMMZhUwcmp1iywzDSMQckdGvaGiJs3h9LU9v
qicOjB6Sy2dPzefccRYJZxjp\nijkio9/w0rsHue2ZQxxuiJMBXD6tgPv+cTYb172ZatMMw+gAc0RGn6ehJc5Db9bw7NY3AJhalM3n\nI0OYUpTN4Fy7xQ0j3bF/qdGnWXugiR+vqOJgfZyczAw+Nb2Aj584iEwLxTaMPkOPzeAoIotE5ICI\nrAvJvi0iu0Vkjf98NHTsDhEpE5GNIjI3JI+IyFp/7AfiO/pFJFdEHvXy10VkUijNfBHZ5D/zQ/LJ\nXrfMp83pqfM3epbmmLJwdTXffqmCg/VxTijK4qkvf4BPnDzYnJBh9DF6cirhh4B5SeQPqOpM/3ka\nQESm45b6PtWn+YmIBBN8PQjcDEzznyDPm4AKVZ0KPADc7/MaDtwJzMKtxnqniBT5NPf78qcCFT4P\no49RdqCGrz93mKfL6skSuO7UwXznQyM4cXRhqk0zDKMb9JgjUtW/AeVdVL8CWKyqTaq6FSgDzhGR\nYmCIqi5XVQUeAa4MpXnYbz8OXOJbS3OBZaparqoVwDJgnj/2Ia+LTxvkZfQBVJVnt9bzsR++wvaq\nKMWDM/mPS0Zw1fTBZFkryDD6LOLq9w4URAYBDaoaF5ETgZOBpara0mnmrrvsKVWd4fe/DdwIVAEr\nga+paoWI/AhYrqq/8noLgaXANuA+Vb3Uy2cDt6vq5b7Lb56q7vLHNuNaQTcAeap6j5d/E2jAtdCW\n+9YQIjLBn8eMdmy/BbgFoLi4OLJkyZLOTjcp9fX1R7YLCgra7LcnM92j5c0xZdHaRl7Z7W67C8Zl\nM39GHvlZ0mEe6Xpu6WqX6ZpuMt3uUlJSUqqqJZ3pdSVY4W/AbN+99VdgBXAN8Jlu2PUgcDeg/vv7\nwD91I58eR1UXAAsASkpKNBKJdCuf8Nv7kUjkqLf5k8lMt6186d/e4LvLK9hSGSU3U/iPfzid43V/\n2trbFd10tct0TTeZbk/Tla45UdV64JPAT1T1atxYzntGVferakxV48DPcGM4ALuBCSHV8V62228n\nytukEZEsYChwuIO8DgPDvG5iXkaa8urmQ9z+7CG2VEYZMyiT+y4ZzifPGt95QsMw+gxdckQich6u\nBfRnL+vWSmF+zCfgE0AQUfckcK2PhJuMC0p4Q1X3AtUicq4f47keeCKUJoiIuwp43o8jPQPMEZEi\n34qbAzzjj73gdfFpg7yMNENVWfJuHZ9d+AbVzcrM0Tncf+kIm6LHMPohXema+wpwB/BHVV0vIlNw\nFXqHiMhvgYuAkSKyCxfJdpGIzMR1zW0DPg/g830M2ABEgS+qasxndStufCcfN2601MsXAr8UkTJc\nUMS1Pq9yEbkb14UIcJeqBkETtwOLReQeYLXPw0gzonFlwapqntvaAMAnTx7EtTMGk2lT9BhGv6Qr\njmi0qn482FHVLSLycmeJVPW6JOJ2K35VvRe4N4l8JXBUQIGqNgJXt5PXImBREvkWWrsDjTSkIRrn\n+69VsnpfMzmZ8MA1ZzGmZU+qzTIMowfpStfcHV2UGcb7orIxxp0vlrN6XzNDcoR//+BwLju9uPOE\nhmH0adptEYnIR4CPAuNE5AehQ0Nw3WeGcczYfLCWO54v50BdjDGDMvm32UWMLbQZqAxjINDRP30P\n7l2fjwPhmL4a4H/3pFHGwGJTeTP3/flVKutjTC3K5l8/MIyhed2KhzEMow/SriNS1TeBN0XkN115\nedUwusOGg8185+8VNESVSHEut507lLysnpx5yjCMdKMrfR/n+BkRjvf6AqiqTulJw4z+z5v7m7j/\nlUqaYsrHzxjLZ6bGbKoewxiAdMURLcR1xZUCsU50DaNLlO5t5LuvVtIShw9NyueBa2ayZvWqVJtl\nGEYK6IojqlLVpZ2rGUbXWLp2L//vlUqiCnNPyOdzZw6xpRsMYwDTFUf0goh8F/gD0BQIVdUeX433\nzGu7Gnng9dXEFD52YgHzTy9E
7EVVwxjQdMURzfLf4RlUFbekgmF0mdK9jTywvJKYutkSPj1jsDkh\nwzA6d0SqenFvGGL0b97c38R3X3VO6ObZk5k7ut6ckGEYQBcckYh8K5lcVe869uYY/ZG3DzVz/ysu\nMGHuCfn860dPYdUq69k1DMPRla65utB2HnA58HbPmGP0N7ZXtfCdlytoiikXT3KBCdYSMgwjTFe6\n5r4f3heR7+GWWjCMDtlT2cA9L1dQH1XOHZfL/yoZQoY5IcMwEujOK+wFtF2szjCOoq45zg2/eIPy\nhjinjMzmK7OG2TIOhmEkpStjRGtxUXLgFsQ7DrDxIaNdWmLK/a9W8u7BZsYXZnL7BUXkZJoTMgwj\nOV0ZI7o8tB0F9quqzb5tJCWuyg9XVLH+YDPHFebyb7OHUJhjc8cZhtE+ndYQqrodGAZ8DLe89/Su\nZCwii0TkgIisC8m+KyLviMhbIvJHERnm5ZNEpEFE1vjPT0NpIiKyVkTKROQHfslw/LLij3r56yIy\nKZRmvohs8p/5Iflkr1vm0+Z05VyMrvPrtbW8srORvCzhoRvPZtQgm0XbMIyO6dQRichXgF8Do/zn\n1yLyz13I+yFgXoJsGTBDVU8H3qXtAnubVXWm/3whJH8QuBmY5j9BnjcBFao6FXgAuN/bOxy3LPks\n3Gqsd4pIkU9zP/CAT1Ph8zCOEa/sbOBPG+vIFPiX84Zx6tihqTbJMIw+QFf6TG4CZqnqt1T1W8C5\nOMfQIar6N6A8QfbXULfecjoJehCRYmCIqi5XVQUeAa70h68AHvbbjwOX+NbSXGCZqparagXO+c3z\nxz7kdfFpg7yM98m7+2v4yYpqAOafUcjMMbkptsgwjL6CuPq9AwUXrHC2qjb6/Txghaqe1mnmrrvs\nKVWdkeTYEuBRVf2V11sPbAKqgG+o6ssiUgLcp6qX+jSzgdtV9XLf5TdPVXf5Y5txraAbgDxVvcfL\nvwk04Fpoy31rCBGZACxNZps/fgtwC0BxcXFkyZIlnZ1uUurr649sFxQUtNlvT9bXdOtblG+/Ws/e\n2hjnj83mCzPzEJG0tTcddNPVLtM13WS63aWkpKRUVUs60+tKsMIvgNdF5I9+/0rc0hDdRkT+DRf4\n8Gsv2gtMVNXDIhIB/iQip76fMt4vqroAWABQUlKikUikW/mUlrYubhuJRNrstyfrS7pxVX6wupK9\ntTGOH5rFl2YNJzdL0tbedNFNV7tM13ST6fY0XQlW+E/gRlw3Wzlwo6r+V3cLFJEbcJF4n/Hdbahq\nk6oe9tulwGbgRGA3bbvvxnsZ/nuCzzMLGAocDssT0hwGhnndxLyMbvLExjpW7GliSF4W//f8YUec\nkGEYRldp1xGJyNki8hFwSz6o6g9U9QdAsW+1vGdEZB7wf4GPq2p9SH6ciGT67Sm4oIQtqroXqBaR\nc/0Yz/XAEz7Zk0AQEXcV8Lx3bM8Ac0SkyAcpzAGe8cde8Lr4tEFeRjd451Azv1lXC8AD18xkzOCu\nNLANwzDa0lGL6H5gQxL5euC7nWUsIr8FXgNOEpFdInIT8COgEFiWEKZ9IfCWiKzBBRN8QVWDQIdb\ngZ8DZbiWUrBI30JghIiUAbcBXwfw6e4GVvjPXaG8bgdu82lG8D67GAcyNc1xHlheSVzh4ycWcMkp\no1NtkmEYfZSOHmEL/TtEbVDV7SIysrOMVfW6JOKkFb+q/h74fTvHVgJHBRT44Imr20mzCFiURL4F\nF9JtvA9UlR+vqOJQQ5xpw7P59GmFqTbJMIw+TEctoqIOjnU/jMLo8zxdVs+KPU0UZAu3nTuUbFvm\n2zCM90FHjuhZEbk3mMkAQBx3Ac/3vGlGOvLWrkoeebMGgFtLhjJqkI0LGYbx/uioFvkafmzGj90A\nnAGsBD7X04YZ6Ud9S5xv/HY1UYV5JxRw3vi8VJtkGEY/oF1HpKp1wHU+ii14p2e9H2cxBiALV1
ez\n/XAjk4dlMf8MGxcyDOPY0JWF8bYA5nwGOK/vbuTF7Y3kZmXw1VnDbFkHwzCOGTY/v9EplY0xfrqy\nCoCvf+Rkxg+xcSHDMI4d5oiMDlFVHlxZTXWzctqoHOafNynVJhmG0c/okiMSkQ+IyI1++zgRmdyz\nZhnpwnPbGli514Vqf+nsoWRYqLZhGMeYrqxHdCduRoJg7aBs4Fc9aZSRHuyubOAXa1yo9ufOHMLI\nAlvkzjCMY09XWkSfAD4O1AGo6h7cND1GP0ZV+bc/rqUxqswal8uFEy1U2zCMnqEro87NqqoiogAi\nMqiHbTLSgL/taOTFjVUMyhZuPnMIofeaDcMwjildaRE9JiL/g1tC4WbgWeBnPWuWkUqqGmMsWuNW\nW73hjEKK8q1LzjCMnqMr7xF9T0Q+DFQDJwHfUtVlPW6ZkTIWrqmhtln5wNSRXDzJnJBhGD1Lp45I\nRG7DLeltzmcAsGJPI6/sbCQ3U/iPT57Gga1vp9okwzD6OV3pmisE/ioiL4vIl0TEFp7ppzRE4/x8\nleuS+/SMwUwYbpOsG4bR83RlqfB/V9VTgS8CxcBLIvJsZ+lEZJGIHBCRdSHZcBFZJiKb/HdR6Ngd\nIlImIhtFZG5IHhGRtf7YD4LZwEUkV0Qe9fLXRWRSKM18X8YmEZkfkk/2umU+bU6nV2gA8fiGOg41\nxDmhKIuPTDMnZBhG7/BeZlY4AOwDDgOjuqD/EDAvQfZ14DlVnQY85/cRkenAtbjJVecBPwmWDgce\nBG7GLR8+LZTnTUCFqk4FHsCtKIuIDAfuBGbhFsG7M+Tw7gce8GkqfB4GsLM6ypJ36xDg5rOGkGlR\ncoZh9BJdeaH1VhF5Eec4RgA3q+rpnaVT1b8B5QniK4CH/fbDwJUh+WJVbVLVrbhlwc8RkWJgiKou\nV1UFHklIE+T1OHCJby3NBZaparmqVgDLgHn+2Ie8bmL5AxpV5WerqokpfHhKPtOGW0PRMIzeoyvv\nEU0AvqqqazrV7JzRqrrXb+8DgvGmccDykN4uL2vx24nyIM1OAFWNikgVzlEekSekGQFUqmo0SV4D\nmifW7GH9wWaG5Igt+20YRq8jrqGR5IDIEFWt9l1dR6Gqia2dZHlMAp5S1Rl+v1JVh4WOV6hqkYj8\nCFiuqr/y8oXAUmAbcJ+qXurls4HbVfVyP/Y0T1V3+WObcd1xNwB5qnqPl38TaMB1FS733XKIyARg\naWBbEttvAW4BKC4ujixZsqSz001KfX39ke2CgoI2++3JelO3vkW5/aU6Kpvi3Hx6HhdOyOkwj1Tb\n219009USCgMWAAAgAElEQVQu0zXdZLrdpaSkpFRVSzrT66hF9BvgcqAUUCA8aKDAlG7YtV9EilV1\nr+92O+Dlu3Etr4DxXrbbbyfKw2l2iUgWMBQ3frUbuCghzYv+2DARyfKtonBeR6GqC4AFACUlJRqJ\nRN7zyQKUlpYe2Y5EIm3225P1pu7v3qymsinOSSOymXPiUDJCY0PpaG9/0U1Xu0zXdJPp9jTtjhGp\n6uX+e7KqTvHfwac7TgjgSSCIYpsPPBGSX+sj4SbjghLe8N141SJyrh/juT4hTZDXVcDzfhzpGWCO\niBT5IIU5wDP+2AteN7H8Acne2ihPb6pHxE1qmmEBCoZhpICuBCs81xVZEp3fAq8BJ4nILhG5CbgP\n+LCIbAIu9fuo6nrgMWAD8Bfgi6oa81ndCvwcF8CwGddlB7AQGCEiZcBt+Ag832V4N7DCf+4KdSPe\nDtzm04zweQxYHnmzhqjCVWeNZ0pRdqrNMQxjgNJu15yI5AEFwEjfsggel4fQhUF+Vb2unUOXtKN/\nL3BvEvlK4KhxHFVtBK5uJ69FwKIk8i24kO4Bz9oDTbyxp4m8TOFf5p7Ezk3rU22SYRgDlI7GiD4P\nfBUYixsnChxRNfCjHrbL6EFiqjz8pltn6BMnD2LUkLw2YY
aGYRi9SbuOSFX/G/hvEflnVf1hL9pk\n9DAvbmtga2WUkfkZfOwkW9XDMIzU0pXZt38oIjOA6UBeSP5ITxpm9Az1zVF+s64WgH88vZDcTAtQ\nMAwjtXRl9u07ceHQ04GngY8Af8fNcmD0MX7xyjYqG+NMG57NBybYqquGYaSersw1dxUuwGCfqt4I\nnIF7Z8foY9Q2x/npS5sB+Mxpg23VVcMw0oKuOKIGVY0DUREZgnsJdUInaYw05E8b66hpjHL6qBxO\nG5WbanMMwzCArs01t1JEhuGWBy8FanHvBxl9iIrGGH/eVAfAp08bnGJrDMMwWulKsMKtfvOnIvIX\n3GzYb/WsWcax5vENdTTHYO6po5k23LrkDMNIHzp6ofWsjo6p6qqeMck41uyvi/LslnoE+Nqck6jZ\n9W6qTTIMwzhCRy2i73dwTHFr+xh9gMfW1xJVuOj4PE4cXUjprs7TGIZh9BYdvdB6cW8aYvQMO6pa\neGl7I1kCnzrVxoYMw0g/uvIe0fXJ5PZCa99g8fpaFLh0SgGjB3UlNsUwDKN36UrNdHZoOw/3TtEq\n7IXWtGdTeTOv724iJxOuOsWm8jEMIz3pStTcP4f3fSj34h6zyDhm/Gatm8rnsmmDKMrPTLE1hmEY\nyenKC62J1AGTj7UhxrFl+ZbDvHWgmYJs4Uqb2NQwjDSmK2NES3BRcuAc13TcInZGGvPfz24C4GMn\nDmJwTneeNwzDMHqHrowRfS+0HQW2q2q3A4BF5CTg0ZBoCvAtYBhwM3DQy/9VVZ/2ae4AbgJiwJdV\n9RkvjwAPAfm4CVm/oqoqIrm4MawIcBi4RlW3+TTzgW/4Mu5R1Ye7ey7pyoaDzby2pZyCbOGyqQWp\nNscwDKNDujJG9BKAn2cuy28PDy2//Z5Q1Y3ATJ9PJrAb+CNwI/CAqoYdHyIyHbgWOBW3SN+zInKi\nX0r8QZzzeh3niObhlhK/CahQ1akici1wP3CNiAwH7gRKcK28UhF5UlUrunMu6cpjG9zY0OXTChhk\nrSHDMNKcTmspEblFRPYBbwErcfPNrTxG5V8CbFbV7R3oXAEsVtUmVd0KlAHniEgxbrqh5aqquBbQ\nlaE0QUvnceAScVNNzwWWqWq5dz7LcM6r3/DOoWbWHmimMDeLy6bZ2JBhGOlPVx6X/wWYoaqTVHWK\nqk5W1SnHqPxrgd+G9v9ZRN4SkUUiUuRl46DNSta7vGyc306Ut0mjqlGgChjRQV79hqA1dOMFk2xs\nyDCMPoG4xkQHCm6i00+qav0xLVgkB9gDnKqq+0VkNHAI12V2N1Csqv8kIj8Clqvqr3y6hbjut23A\nfap6qZfPBm5X1ctFZB0wLxjLEpHNwCzgBiBPVe/x8m/ilrlo0x3oj90C3AJQXFwcWbJkSbfOs76+\n9bIVFBS02W9P1l3dd8uj3P1aPXlZ8NPLRpEZbTwm+ZrusddNV7tM13ST6XaXkpKSUlUt6UyvK8EK\ndwCvisjrQFMgVNUvd9s6x0eAVaq63+e3PzggIj8DnvK7u2m7/tF4L9vttxPl4TS7RCQLt5DfYS+/\nKCHNi8mMU9UFwAKAkpISjUQi7/X8ACgtLT2yHYlE2uy3J+uu7pKVbtjusmmDuOi8s49ZvqZ77HXT\n1S7TNd1kuj1NV/pu/gd4HliOGx8KPu+X6wh1y/kxn4BPAOv89pPAtSKSKyKTgWnAG6q6F6gWkXP9\n+M/1wBOhNPP99lXA834c6RlgjogU+a6/OV7W59lS0cKa/c3kZQofs7EhwzD6EF1pEWWr6m3HslAR\nGQR8GPh8SPz/RGQmrmtuW3BMVdeLyGPABlz4+Bd9xBzArbSGby/1H4CFwC9FpAwox41FoarlInI3\nsMLr3dXd6L90408b3aJ3l07JpzDXxoYMw+g7dMURLfXjJUto2zXX7QpcVetwwQNh2Wc70L8XuDeJ\nfCUwI4m8Ebi6nbwWAY
veo8lpzY7D9by2s5FMcS+wGoZh9CW64oiu8993hGSKexHVSAN+9vIW4sBF\nE/MYWWBzyhmG0bfoygutNq9cGlPVFOexlS4i/QqbU84wjD6IrUfUx3l6Ux1N0TiR4lwmDs1OtTmG\nYRjvGVuPqA/TEI3zlzIX9/+Jk601ZBhG38TWI+rDPLelgdoW5ayJwzh5hLWGDMPom9h6RH2UaFxZ\n8q4L2f7CB0/AvUplGIbR97D1iPoof9/RyKGGOOMLM7n0lNGsXr2780SGYRhpSK+vR2S8f1SVJ3xr\n6IqTBpGRYa0hwzD6Lu06IhGZCowO1iMKyS8QkVxV3dzj1hlJWXewmR1VUYblZjB7Yn6qzTEMw3hf\ndDRG9F9AdRJ5tT9mpIin3nWRcvOmFpCdaa0hwzD6Nh05otGqujZR6GWTeswio0P21EQp3dtEdgbM\nmWKtIcMw+j4dOaJhHRyzGjBFPF1WjwKzJ+YzNM+m8zEMo+/TkSNaKSI3JwpF5HMcm2UgjPdIVUML\nL2xtAODyE7u/WJVhGEY60VHU3FeBP4rIZ2h1PCVADm69IKOXeWzFThpjymmjcjjepvMxDKOf0K4j\n8iumni8iF9O61MKfVfX5XrHMaEMsrjz06jYALp9mrSHDMPoPXZni5wXghV6wxeiAN/Y0sbuygTGD\nMzmrODfV5hiGYRwzUrKUp4hsE5G1IrJGRFZ62XARWSYim/x3UUj/DhEpE5GNIjI3JI/4fMpE5Ad+\nyXD8suKPevnrIjIplGa+L2OTiMynj/CUf4H1smkFZNh0PoZh9CNSuab0xao6U1VL/P7XgedUdRrw\nnN9HRKbjlvo+FZgH/EREgnCxB4GbgWn+M8/LbwIqVHUq8ABwv89rOHAnMAs4B7gz7PDSlbLyFt45\n3EJhXhYXT7KARcMw+hepdESJXAE87LcfBq4MyRerapOqbgXKgHNEpBgYoqrLVVVxy1JcmSSvx4FL\nfGtpLrBMVctVtQJYRqvzSlv+vMm1hq47ZyL5Wen0kxmGYbx/xNXhvVyoyFagCogB/6OqC0SkUlWH\n+eOCa9EME5EfActV9Vf+2EJgKbANuE9VL/Xy2cDtqnq5iKwD5gVz4onIZlwr6AYgT1Xv8fJvAg2q\nGp5PL7DxFuAWgOLi4siSJUu6da719fVHtgsKCtrstycLyysa4/zv52uJK/zko8cxWJra1X0v+Zpu\nanXT1S7TNd1kut2lpKSkNNTr1S5dmfS0J/iAqu4WkVHAMhF5J3xQVVVEet9DtrVhAbAAoKSkRCOR\nSLfyKS1tfeUqEom02W9PFpb/cXMNMYXzxufxkQvP6VD3veRruqnVTVe7TNd0k+n2NCnp51HV3f77\nAPBH3HjNft/dhv8+4NV3AxNCycd72W6/nShvk0ZEsoChwOEO8kpLmqLKsi3uCcVCtg3D6K/0uiMS\nkUEiUhhsA3OAdcCTQBDFNh94wm8/CVzrI+Em44IS3lDVvUC1iJzru/KuT0gT5HUV8LwfR3oGmCMi\nRT5IYY6XpSV/29FATbMytSibk2wFVsMw+imp6JobjZuxISj/N6r6FxFZATwmIjcB24FPAajqehF5\nDNiAWw/pi6oa83ndCjyEm/tuqf8ALAR+KSJlQDku6g5VLReRu4EVXu8uVS3vyZPtLqrKU5t8a+jE\nAluB1TCMfkuvOyJV3QKckUR+GLiknTT3Avcmka+kddaHsLwRuLqdvBYBi96b1b3Pa5sPs6s6yvC8\nDM4bn5dqcwzDMHoMiwVOUx55bTsAHz6hgCxbgdUwjH5MqqLmjA44XB9j2duHyBT48GR7gdUwjP6N\ntYjSkL9uqScWV84dn0dRvq05ZBhG/8YcUZrRElee3eLWHJp7goVsG4bR/zFHlGYs39VIZVOck0YX\nMn2khWwbhtH/MUeUZjyz2YVsf/a84y1k2zCMAYE5ojRiW2ULbx9qoSBL+MSZ41JtjmEY
Rq9gjiiN\n+ItvDV00KZ9BuRbQaBjGwMAcUZpQ1xznb9sbAQtSMAxjYGGOKE14YXsDTTHltFE5jB9irSHDMAYO\n5ojSgLgqz5S5brl51hoyDGOAYY4oDVh7oJk9tTFG5Gdw9tjcVJtjGIbRq5gjSgP+4ltDc6YUkGnz\nyhmGMcAwR5Ri9lQ2sHJPE1kCl06xeeUMwxh4mCNKMY+u2EkcmDU+j2F5Nq+cYRgDD3NEKSSmymMr\ndwLwYWsNGYYxQEnFUuETROQFEdkgIutF5Cte/m0R2S0ia/zno6E0d4hImYhsFJG5IXlERNb6Yz/w\nS4bjlxV/1MtfF5FJoTTzRWST/8wnhazZ18TeqkbGDM5kxnE5qTTFMAwjZaTihZUo8DVVXSUihUCp\niCzzxx5Q1e+FlUVkOm6p71OBscCzInKiXy78QeBm4HXgaWAebrnwm4AKVZ0qItcC9wPXiMhw4E6g\nBFBf9pOqWtHD55yUZX6W7Q9Pzrd55QzDGLD0eotIVfeq6iq/XQO8DXQ0sdoVwGJVbVLVrUAZcI6I\nFANDVHW5qirwCHBlKM3Dfvtx4BLfWpoLLFPVcu98luGcV69zuCFG6Z4msjOFiydZt5xhGAMXcXV4\nigp3XWZ/A2YAtwE3AlXASlyrqUJEfgQsV9Vf+TQLca2ebcB9qnqpl88GblfVy0VkHTBPVXf5Y5uB\nWcANQJ6q3uPl3wQaElth/tgtwC0AxcXFkSVLlnTrHOvr649sFxQUHNn/06Ymfv9uE+eNz+PWM47u\nlgvrdiQz3b6pm652ma7pJtPtLiUlJaWqWtKZXsrmkhGRwcDvga+qarWIPAjcjesyuxv4PvBPqbJP\nVRcACwBKSko0Eol0K5/S0tIj25FIhNLSUmKqvLSrFoBb555OQfWOo9IFup3JTLdv6qarXaZrusl0\ne5qURM2JSDbOCf1aVf8AoKr7VTWmqnHgZ8A5Xn03MCGUfLyX7fbbifI2aUQkCxgKHO4gr17lrf3N\nHKqPM2pQJhecMLK3izcMw0grUhE1J8BC4G1V/c+QvDik9glgnd9+ErjWR8JNBqYBb6jqXqBaRM71\neV4PPBFKE0TEXQU878eRngHmiEiRiBQBc7ysV1m2xTV/Pzw5nwybScEwjAFOKrrmLgA+C6wVkTVe\n9q/AdSIyE9c1tw34PICqrheRx4ANuIi7L/qIOYBbgYeAfNy40VIvXwj8UkTKgHJc1B2qWi4idwMr\nvN5dqlreQ+eZlIqGGCv2NJEpcPFkC1IwDMPodUekqn8HkjUDnu4gzb3AvUnkK3GBDonyRuDqdvJa\nBCzqqr3Hmue3NRBXmDUulyKbScEwDMNmVuhN4nHl2a3+3aEpttyDYRgGmCPqVV7ZfIgDdTGOK8jg\n9NE2k4JhGAaYI+pVfvuGC9O+dHIBmTaTgmEYBpDC94gGGpWNMf66/hAZWJCCYRhGGGsR9RIvbW8k\nGlfOKs5lRL4FKRiGYQSYI+oFVJUXtrkghQ9Za8gwDKMN5oh6gc0VUXZWRxk+KIezinNTbY5hGEZa\nYY6oFwhaQ1fOHEe2zaRgGIbRBnNEPUxzTPn7DueIroqM70TbMAxj4GGOqIdZsaeR2hZl8rAspo8d\nkmpzDMMw0g5zRD3MS9sbAWzxO8MwjHYwR9SD1DdHeWt/EwDnT8hLsTWGYRjpiTmiHuTlTYdoicO0\n4dk2walhGEY7mCPqQZ57ez8AJWMtZNswDKM9zBH1EPG48vw7BwA42xyRYRhGuwxIRyQi80Rko4iU\nicjXe6KM6sYWzpk8nIlDs5g4xKb0MwzDaI8BV0OKSCbwY+DDwC5ghYg8qaobjmU5wwpy+MlnIqxc\nuRKxmbYNwzDaZSC2iM4BylR1i6o2A4uBK3qqMHNChmEYHTMQHdE4YGdof5eXGYZhGClAVDXVNvQq\nInIVME9VP+f3PwvMUtUvJejdAtwCUFxcHFmyZEm3
yquvrz+yXVBQ0Ga/PZnp9n/ddLXLdE03mW53\nKSkpKVXVks70BtwYEbAbmBDaH+9lbVDVBcACgJKSEo1EIt0qrLS09Mh2JBJps9+ezHT7v2662mW6\npptMt6cZiF1zK4BpIjJZRHKAa4EnU2yTYRjGgGXAtYhUNSoiXwKeATKBRaq6PsVmGYZhDFgGnCMC\nUNWngadTbYdhGIYxMLvmDMMwjDRiwEXNdQcROQhs72bykaHtQwn77clMt//rpqtdpmu6yXS7y/Gq\nelxnSuaIehgRWRlsq2pJeL89men2f910tct0TTeZbqLsWGNdc4ZhGEZKMUdkGIZhpJQBGTXXyyzo\nZL89men2f910tct0TbdXsTEiwzAMI6VY15xhGIaRUswRGYZhGCnFHJFhGIaRUixYoQcQkZOBDwEn\nAd8DTgPuAfKAvwBDcS+ORYGfq+pSn24W8LaqVotIPvB14CxgA/AdVa3qpNxRqnpAREao6uGEY4+o\n6vXvNV0yXSDWmd77yV9VD/jtTvW7k6ar+mG9kKxLNqUaEclW1ZYE2UhVPZQgK8HNRh8D3lXVd7py\nLKQzEagBqoCJwLle/zlVLQ/pCW5RyvNxL4fvBt7Q0CC1n4S4BThJVd8RkYtpvf9fBqZ7OyrD5wQU\nANWqWikik4AS4B1VXZdg6+mq+lY71ysHaAns8WXf6G1dHdgL/C9V/UmS9CcnXh8RuTVRV0QG4+qF\nLUClvx7nAsFL8y+ratzbc4M/vhM4FVgFFAKPA3OAH/tzPh+Y7dMP9ee/CRgFrPfXbx8w2n9O8vJ3\n/fUdDvwD8BbwkqouCV2D2cCKoI7qKSxY4RgjIl8GvoVrbRbh/liBwxfcW8ox3I8f3Dj/qar3ich6\n4Aw/MesjQDnupvsH4KM+/QQgB2gAXsTdWGfhKoICf0wA9d9h4kAjcIFPOzhkVww3CWy4ldwIVAPf\nx81SXgRMSshbgSafLsvbsQsY68/t58DN/nhws4XLqMX9CeuAyThn3eTPJe4/mUnKO+A/o/01afb5\nJuqqPxbon+avUTOQ7c87O6QfC9kXBR4BrvbnEuiA+113AD8F/o6rJPJwv9Mu3G9y0NtyDlDsy30H\neApY7sttBu7CVeY7/fU93p/jJlp/p3G+7Gm4ivFtfy5nAq8Cn/Xpgt9XcTPN/8DbuQdYBtwEnIKr\nyGZ7G08Htnr747gK6hzgMK4yWwsM8jZ+w2+PBc7AVZZDvL3Nvtwsf+0+C3zSl3+LP7+zvY21/hre\n7s+twW//D/Alby/A68CFwIn+t4kBXwB+A8wA/uqvcQbwQ+Dz/vrOBh4FluAq82zgfn9OrwIvACu9\nLefiHhTnqOobIvIzYD7uXmr259YE5Pqy/t3neR0wC3cPPgL8ytuJtzOC+4+v9de5wKcp8PY2+msV\n99c+YAPuPggWAgruucT/c6W/ttPpuFFRjfuN4r7cKFCPc2rJ6oo9wC9wD9OTcE5rpare0UEZ7w9V\ntc8x/OBuuvW4m3iD/5HLcTdm1O8HN1Cc1srvKa/XhLt5A70YrRVyoN9CayWb+ImHthuBpbhKJqiQ\n29MNPtt8ubUJ8lgS/WZc5RT3dh9KkncstB/3tjfjKu72ziH4NPg09d6u8iQ6sST7O0LldHR9Ansq\nEq5Pc4JOkKY+tB/F/cHLfR7JrmWijYm/WxzY2MXfMppEN9bOd7Ad3q/y5R9MOKeGdq6j4u6BsHx1\nkvNMvMYNXhYuI45zALEEeeJ1iSeUF05fFzoWVKTBPbo/IU1zKG1Tgn1bfPrgvxi2PxbSD8pqTLAj\nuJYd3YMayj/Zb/92O2njtLW3hrb3VuJvF64PEuuIXQn2B+VXJ5R3OCG/wLYY7qFjPc7JvdWj9Waq\nK+7+9ME1bRv9n/EtWivS4GYLfuTgz9qS5MZIrIAS95twDi6acLzeb5cBP0nQrwndXGWhY2+F0jX7\n4/tCN3C4jJrQ
eTTgKqXNCXZuTPhzhCu6Mlor/EBelpA+XOmEj0W9vMbbFdZNvMZxXDdKoHOgA/1w\nmnDlXZfwWzR4+Rq/XxU6ryCvoAJMPJ+gjDdprRDCv1fYtpZQ/uH0yZx24AyD/Q24CjmoTIP7KtpO\n2jjwUIKNwYNEI64lFT4W/u2aQ3kncx7RhLKb/SfYD1+7oGUU5FUayuupUJrEijaWUF7i/6E6IV0L\nbR1Z4vVTYG/CuTT6axrYGS4/OB7+nwRlBv/tQO9xfyxoWUVDeb1L6/2lwP9JOMcGYF3CdQzyrqD1\n/gzKCo6totUJxWl9iDiYYNtqWu+7GK6VucnvP+3Lew1Y15N1pwUrHFtG4yr37bjunM3A3bROGniy\n/87DNYWzaG0FgbsxEglXbvg0/0WrU2vxeeX640Wh7WRP6e/gWjzgbrqgSR/cC9tobb2tC9mWE0on\nuO6UrV4Xb8ertD7F7ffysF11fjvqz/umUP7NobzLvX6Qd7Xf3u9tCj9lBtcy/Ec86HWagT93oB9U\nClU4x9AUkgcVAbR2mwTdLkNwjiVwXCtD+Qdlgbv+wfU8xZe7yx8LrnFAUEEGXYPBsQxcF0o0JAt0\n3wqVdTIwzKcXWlvTmbTeB+DuSbzOR0Plx2ntDmvCdb8F5SutzrHFf8BVzsG1jOO6uwKdBlrvqQz/\nqfZ6QTeQAPd5O9/y5c8M2TTX2x+ccy2ucg3OMTinoEs27IgkdN5x3H0ePDAEY1eZtLau8HY3hs4v\nCzjO250fOufgoa3alxM4orJQOg3l80Hcb5hD628XHCv09gf23u2PrfPyPNz4EP54Nq7Fj89vG633\nRdiJnuGPB9fqDH9uI0JlgbvegU4j8JJPVwtc4uVFwHfoQWyM6BgiIgtxT3GlqrpDRE5U1XdF5FRg\nEe7J6BPACbgfuAp3g5yKG2eI01pxC+5pPrh5B9P6hw4qLMU9uX6J1v7kBlrHiVpoO2bUgHuq/ztu\nPOF83FLp4fEbDe3vwTmuDFyfu9I6riIhvS04J3MCzhH9GTcecFvI7npvS2aojKD1UY57Er7CH9+M\nc3TBmEUDrRVj4Mx24loBc2itqAP7wq2JOlzFEOh/nNZKKagAanxZmaF0gc5zwKWhfPcBY0LXNnCq\no3BjEp+gdbwMWvvlgwpoIzDVl3UQN9YSJlx5iv88BPwT7oHmuJBekHdGSL8B50jy/HkN8/Jg7CwY\nc4PWyjI4Fth5GHdvnhDKN+xcg+sWOJTgflgKXIZ7gj7e25qJG7u5GVeZBy2RbFzLdbD/ZNE6npqB\ne6h5Gfga7gGkkdbAiUp/vYMHi4DA8Qa//3rc+EngEAV3PzR7W/Jw92WmLzcXt2DmBaFrdBh375zo\n02T58jO8/gLcuFSTP48gXXB9AweaBfwJ+BStTkBC6Zpx99Y63JjVu7gH2zjOeeQAz+P+J8fh7rPv\n4no/7gCu99dohb+2Y/z5VeICo+pwraQsfx4/wTn6a3H3yGjceORLuAfM6f76ngZ8QUMBIj2BOaJe\nRkSKcAOBF+FuiDrcH+0w7o93Oe4mEFyldTzuhmrGVTK5uCb5ApzzycLdeMGg5y+AT9PafTSa1gry\nEi/bivsjjcMNwjfjmuMZtD75VQJTgN2qer0PnsikNVAC4Ge4Qe15uG4NwQ02X4OrNPbh/lAn4yqO\nItxT5E5/Tufh/mS5/joET55B5ZmF+3Pt8LZm+GvyAi5I4BO4P9lwnNP+NW5g+2Tcn+oKWp+qgzGS\nMTjHVeFleNtyff5BN1GeP89Gb0ue/wRdMg2+zBpv30rgl/74H3AtoCBYoQDnrEb7MktxDyUvAw/Q\nei9s8GXU4SLQtvnf7gTcw0qdt3Uv8GPgMVzr7DVcJNRY/7v9HNfyivnruwj3sLLT23WhP8/f+d/4\nGr/fhKuY6/x3Gc4hbMBVtu/4ayO4Lp0bgI/56/as/5wALPZpf4q7Nxb787oA9y
BU63/D3+Hu1a/4\n37HB2/0CLrp0A/BV3G++01/nz+FaUf+IC24ZjXtIasC1Ui/AVbL4a34Y9/DxSf+7fRv3gHW1T/ea\nv24TVfUUESn09ozB/S+j/vjJ/nf7HS6o4gRc8MOJuECUc3H3WS7uQW+G/60P+WsRRPeNxd3nW3CB\nJyP99XzQ5z3bX9f1uKi44MErKckiO7uiE4pkHZWor6GIUiCzs/yPBeaI0ggRudFvZgOjVPUeL7vU\nf4bT9sk23MUBrU9gidE1dbgWRVge/PCJuu+HoOxw6yp4ApcEneDpN+jaCJ6y63GVVgYd2xZLohO0\nZupx1zBoDQaf9s453GqRDmThP0vQ/VQJ3KCqL3Zga68hIkNxT8hX4pyW4hzmE8B9XXmyFZFluEr3\nk7inb8VVyk/gXjcoC1VkJ+Aq7jm0PsEHLfm5XneIt2k88LSq/taXMwr4tqre6veX4hzMr1T1IyF7\nRgCPquqlfv9ZVb3UyxeHbB2F+/2POl+vG8VFvM0LXZtDOId1nLcviKBsxjm+Pwb5+PNYjXPOp6nq\nkQRVAr0AAA1eSURBVG5EEfm5z+vKhOvwF6/ykYTfI1Ge769ZHc7hl+Oc3RifJhd3X4vXC6JLYzhH\nfQbuf9aAG9upwDm0oIUYhHMX+HPLwo1HfdPnl+Pzj/nyP+Dzz8c90L2moXD8Y405ojRCRHaE91V1\nopeNwd1I03BPycf///bOPcTu4orjn5PdNRs3yeahibEaHzUaq21MGq1asNRasUQokdAnWKGgoH0q\n7R9qH4QKlrZIUawUqkVsRfv0gUhKQUrT1kZqzEONqc+Yl26icbMmm+zu9I/vHGfuzTvZvYv0fGHZ\ne+fOb34zc+d3zpxzvnMuWjhQhO8AWqzusnN3zABF4O+srnN3zm6KO5Dq+lp5+C64dnFAca24n7tm\nRf0XWXZOe3WKtz/o/ZSHalfVL7dAxubXbg120OgO8nE5XHH4nHi9DUgwJAqt2C0eb8fH3UGJ8/iD\n7vd3IbEG0Z19HIcCDzDfiwTi/Smlz2QBXOPzyHI4thrjDrR7v6Dp3quRAHWLzWnv63J5ex6Hu31r\n16sLngmUOE0PsiL9Gg/IX4ssiyuRIO1GQuzW3I830e7+58i9dBdFePp36Rb3EGKXzqa4XjciK8L7\n5+VHUWJAfej7mVTN5742K82f+fteSkxoFaKTj2ua03coFGu30h9DruxplHXh8HVab8Q2I2/A+bnt\nen1up7j56vhv88ZnJ4UyXl+/tzE2b7L2NzcHg0Fk0fkRlPUppVOPoL39IhRRi2FmK5BC8QV2pPAv\ncAuymFyReCxnB1IWHj9wdKKFXr92957HVvzXGt011oYEhrvPXkPC0mM/zr4BKcwzKA+pnzPxs0s7\n0EM+jUal0p/vPYXyMPXle7yMdokeT+jI7Xh8zOdjGTrfAY3CwR/oARoVucdvXBG50HYl6oqoP38+\nnsJOcwXpVGIXohNz+32IvPIicom1UYRqN3LPXdTU/y15vmnqe33Gycu936mp3OM/HVVZHUuq197e\nhNyRoNmyrC3SA635vVml9WdQYisdlPM4HrNLlPXmY6/jYuS6Y5Ab8AS0pk9AZ5M+R+MGp3n9QFkP\nOyiubChK1+fZLfZ+JNTb0boequr4WvT2F6NziAM0xlP78nsfmzPu5uQ6q1DscQBtLBKFdDOOEqva\ngFyavta9fZCSngysTCnNATCzfnSI+MOMIII113pMRwt2GxLE16MF0EehWTtDyYXbAIUd4zRoF5j+\nkE+l7OD8AWxHLjk3qcchs3steoiep5AY1qLFPC/fp5tGFxu5zeOREhrK1xxFsbjcZdCJdruW7+kx\nlomU4HoX8q27X90F9NGUIHd7/j+xanMQPYBT0K56LdrJr0CZKN5GBwl9t9/svhxDUUJe7n2vx0BV\nzzcNnRSl/iqFkTaALIKePK76PFU3EhAL0X
c0CQmK4/LcXFrNWWcuO4E9lQUUJd/MvKq/H18zLjxr\nRqZbz06Bd0H0DBJQqfq7jcIa3IbiNt52Xy5zYsVbKPbi95tR9fkZGqnb/Yjx6X1/Dbm7nM6+u6q7\nMv/363dShOvLFKZlG7IA/fvaRbHCfK5q687X1UloPUyp+uoWst/vmfy+pujvrsr82dqV++SehAEU\n9yH3axpaG/W5IF/Lfj2U4wUec3UZ7YeA3U03lmLF9aFYVTtal72U9eqby61I8f6N4o3wufN2JuS6\n7Wb2HzO7LH++nBFGWEQtRmbWTa+KbkWpRKajxXUqCmSeih7YuWg3/U3gFuAm5BJx99WDuZ0taNG8\nigRZby7fjQLpF+T3LyCrog+RCWZX9d5GD8CZuf5l+d5/Ri6ZGi+mlG40swW5/53o0NuNeZxHowcd\nynkMf9+VXx+DBPKG/DeNYkm4K2eAwmpbk8dJZiM6nZqq7Pg8l5tzW4uQYlud2zgp378/l51M405y\nS+7LJvQ9nJb75bgit3MvsmSOR4Kmm7LzHqzGYUigfoDiUgJZbech4XVKVd6DvofTqrJNSLi7e9Gx\nJd+3LqvZgzW2U86bTKQI94l5rtry+I+mxHgmV9fvoFigvvnw3bzHU2blus2uVt9k1Od4duX5eDf/\ntSMl7/U7EAN1QX7/BsVKfgOtnbE0WrB1Rox2pODOodFC3IiINL4hqq3KZgtsA2KafQptBMcCS5GL\nciwS7lPQetiKSCLfzde6wu5E2SEuZE/L3GM17qIDbQanIwU4P1/vRxe+hsgrO/NcuXX+hzwfH0Vp\niPzM0kIKhfuRlNIiM7sGkSy6KBZdPeYhtEGdgL7/cUBXakoXNdwIRRQIHAIy6/EHKH3NJA7Nq+DW\n5zLgO0hJrkcUWcfTSDleWJUtQQJ1KoppOLYiuu1ncz+2U1K5/AO5J8cjobQAUcB7gU+jDUYnYuct\nAT6Z2/SzV19AGxqQQH0EMbomIwvI2V4eg1qHBPw7yDqcjRTI67neeGRFLUHM0Kn58/uRwJ2e23YB\nvga5druQIHVWXwdSlu6qrMk7NYHHrY6Oqu52ZDm/gjYG09BGqJvGmOhL+R6TkTIaQiSK2xF7cUP+\nfAZSYn9Clu2kfM9/o5RGv0abOigKz88fbc7jdtedp+FxF+PryFXoqaVA1PJTKK41d93dDXyZxs2H\n152V53or2uQ8ijaVx+TvYwpKNzUjz4mhzeqK3MaVaHNLSulxRgihiAKBYYKzHlNK91QMSPZVNkJ1\nL0E5wp5EzKffIuUG8PWU0ryKiXkxshCmAA9X9RZTXETHIeH1VyTovoV23jcgcsHlFIW0GwnFp4Fv\nAGNTSh8ys/uQVbGR4g7rQ8LPqfBfzdf2IKXkeQs3I7fTUchamAlsSyl9MI9jca57oPHua26m5rJu\nymHR6RQSj8daNyEFdj6ilZ+F4knu1ltMob7PrPpxCcoV+Xi+5+o8njfRBuTJPI9+eBWKYnXSkMMt\nvWaSgr+3YarrbsNFaNOli1OaxwghFFEgMExw1mPFdnwPeysbobrHAZtTSiea2Xq0830XWUJbUkpz\nKyam52i7CFkgXu/Y/PcsEoT7awMkjNfkem253iu5f3PNbFd1r3fRTr09l01CQn8tUnATEIHjbOTS\nPIXihvJzTpMRHfyG3M+egxjvvuamHVk47tp6muJ2HIPIAKtQxoPvUdIA9SIF6oe49zVnXSij/px8\nzxmUxMBnIMtjLsVqm4OU1UDVD2e/OhMW9mTKtlX/j6SuMyLbkPIdg5QmKaW5jBBCEQUCh4CK9QjD\nx3wMBBx1WiaPYQ1Rztc1xwidKVtn8jiSup7/0mNgrjg31OemhhvBmgsEDg3OerwCkQreQbGHmvXY\nRyMDrZc9s5mPVF3frfc2lTsrq6+pXmpqw3F70/X1b2HVbQyin1DY27287mDVhsdqnJ3mLigPhtes\ntbpdUG
zKsb16fTDj3dfcDFR1f1aVOZ7K/+skqnUfDmbOhqp7em5Dv24bhWACiteArDCPebVR2K9W\n/dVMWRumuh25bheyTDuQdTei9O34YbxA4NDwKFJGvShIXTMgnfXobEfHsyj2cFlT2UjUfRHFWd7a\nSxt+WHhuVW/OXtq9A6WKWkCJW3gfNuW2vY3tKN3Ql5rq7kQ78LdQEH4mcr/1IKbh1tzGj9EJ/+dQ\nsPw5FEcarNpdX/W9uQ93IALHgca7r7nxsgmI2rww328mSld1D1JGy/M4foKSDtd92N+czUI07tMp\nLNQe5GacjVh5s/I8PohiSQ+gWNwtNLJiPeZEVeZM2TlN5YdbdwuZkp5SWmpmd9EChGsuEAgEAqOK\ncM0FAoFAYFQRiigQCAQCo4pQRIFAi2FmN5nZajNbYWbLzexjB77qsO/1hJnNH6n2A4HhQJAVAoEW\nwswuQIdA56WU+s3sGBpz3wUC/3cIiygQaC1moAOY/QAppZ6U0gYz+76ZLTOzVWb2SzMzeM+iuc3M\nnjKz58zsXDP7o5mtNbMf5Tonm9nzZvabXOf3OddfA8zsUjP7Z05o+TszG5/LbzWzZ7OF9tMWzkUg\nAIQiCgRajSXAiWb2gpndaWafyOV3pJTOTSmdjRJNXl5dsyulNB/9xs9DwHUo88BV+QffQKf070wp\nnYnOilxb3zRbXjcDl+RULU8B1+frFwJnpZQ+gs7uBAItRSiiQKCFSCltR6lkrkapUx4ws6uAT5rZ\nk2a2EuU+O6u67OH8fyWwOqW0MVtUL6FEowDrUkpL8+v7UF6zGuejHypcambLga+g8zrb0HmbX5nZ\nFZQfjQsEWoaIEQUCLUZKaRB4AngiK55r0K++zk8prTOzH9L4C6D+m0JD1Wt/789w84HA5vcG/CWl\n9MXm/pjZeSgp6SL0UwMXH+KQAoEjQlhEgUALYWZnmNmsqugclDAUoCfHbRYdRtMzMxEClI3g702f\n/wv4uJmdlvvRZWan5/t1p5QeA75N46n7QKAlCIsoEGgtxgO3m9kkyi/uXo3y1q1CqWOWHUa7a4Dr\nzOxulFrmF/WHKaU3swvwfjMbm4tvRqmKHjKzTmQ1XX8Y9w4EjgiR4icQeJ/DzE4GHs1Eh0DgfYdw\nzQUCgUBgVBEWUSAQCARGFWERBQKBQGBUEYooEAgEAqOKUESBQCAQGFWEIgoEAoHAqCIUUSAQCARG\nFaGIAoFAIDCq+B9NZzvbe76SRgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fa8dd91a3d0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Frequency distribution of the per-row description lengths (number of words).\n",
    "# The lengths are read from the wdcontent_len column of df_train_content, so this cell\n",
    "# only depends on the frame built in the filtering cell above.\n",
    "fdist1 = FreqDist(df_train_content['wdcontent_len'].tolist())\n",
    "print fdist1\n",
    "vocabulary1 = fdist1.keys()\n",
    "print vocabulary1[:10]\n",
    "# Cumulative plot limited to 250 samples.\n",
    "fdist1.plot(250, cumulative=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "** content 长度（词数）\n",
      "max = 2787， min = 1, mean = 80, median = 40\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZYAAAELCAYAAAD6AKALAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X+UXWV97/H3h4TSXAJBfnTuNFAGL9GuQGp0RogVeieC\nJbXUYAsYrpUgKWkv1B9LtCT1LsXryircFrhFBI2GS/ihQ4pySYGUYshofxjSBAOT8KMECddMYxDE\nQKjETvzeP/ZzyM4wP86e2efHzHxea+01+zxnP/t8zz4n55vnefbejyICMzOzshzU6ADMzGx8cWIx\nM7NSObGYmVmpnFjMzKxUTixmZlYqJxYzMyuVE4uZmZXKicXMzErlxGJmZqWa3OgAGuHoo4+Otra2\nEdV99dVXOfTQQ8sNqASOqxjHVYzjKq5ZYxtNXJs2bXohIo4ZdsOImHBLe3t7jNS6detGXLeWHFcx\njqsYx1Vcs8Y2mriAjVHFb6y7wszMrFROLGZmVionFjMzK5UTi5mZlcqJxczMSuXEYmZmpXJiMTOz\nUjmxmJlZqZxYzMysVE4sI9S25L5Gh2Bm1pScWMzMrFROLGZmVionFjMzK5UTi5mZlcqJxczMSuXE\nYmZmpXJiMTOzUtUlsUiaJOn7ku5Nj4+U9KCkp9PfN+W2XSppm6SnJJ2VK2+X1JOeu16SUvkhku5M\n5Q9LaqvHe4LsWhZfz2JmdqB6tVg+DjyRe7wEWBsRM4C16TGSZgILgJOAecCNkialOjcBlwAz0jIv\nlS8CXoqIE4HrgKtr+1bMzGwoNU8sko4Ffhf4Wq54PrAyra8EzsmVd0XE3oh4FtgGnCKpFTg8Itan\neZdv7Vensq+7gDMqrRkzM6u/erRY/jfwZ8AvcmUtEbEzrf8IaEnr04Ef5rbbkcqmp/X+5QfUiYg+\nYDdwVInxm5lZAZNruXNJZwPPR8QmSZ0DbRMRISlqGUeKZTGwGKClpYXu7u4R7WfPnj10d3dz+ay+\n18tGuq8yVeJqNo6rGMdVTLPGBc0bW13iioiaLcBfkLUutpO1TP4duB14CmhN27QCT6X1pcDSXP0H\ngHelbZ7MlV8AfCW/TVqfDLwAaKi42tvbY6TWrVsXERHHX3Hv60szqMTVbBxXMY6rmGaNK6J5YxtN\nXMDGqOK3v6ZdYRGxNCKOjYg2skH5hyLiD4HVwMK02ULgnrS+GliQzvQ6gWyQfkNk3WYvS5qTxk8u\n7Fensq9z02vUvAVkZmYDq2lX2BCuAlZJWgQ8B5wPEBFbJa0CHgf6gMsiYl+qcylwCzAFWJMWgBXA\nbZK2AT8hS2BmZtYgdUssEdENdKf1F4EzBtluGbBsgPKNwMkDlL8GnFdiqGZmNgq+8t7MzErlxGJm\nZqVyYjEzs1I5sZiZWamcWMzMrFROLGZmVionFjMzK5UTi5mZlcqJxczMSuXEYmZmpXJiMTOzUjmx\nmJlZqZxYzMysVE4sZmZWKicWMzMrVU0Ti6RflrRB0qOStkr6fCq/UlKvpM1peV+uzlJJ2yQ9Jems\nXHm7pJ703PVpJknSbJN3pvKHJbXV8j2ZmdnQat1i2Qu8JyLeBswG5kmak567LiJmp+V+AEkzyWaA\nPAmYB9woaVLa/ibgErLpimek5wEWAS9FxInAdcDVNX5PZmY2hFrPeR8RsSc9PDgtQ81HPx/oioi9\nEfEssA04RVIrcHhErE/z2d8KnJOrszKt3wWcUWnNmJlZ/dV8jEXSJEmbgeeBByPi4fTURyU9Julm\nSW9KZdOBH+aq70hl09N6//ID6kREH7AbOKomb8bMzIalrAFQhxeSjgDuBj4K/Bh4gaz18gWgNSIu\nlnQDsD4ibk91VgBrgO3AVRFxZio/HbgiIs6WtAWYFxE70nPPAKdGxAv9Xn8xsBigpaWlvaura0Tv\nY8+ePUydOpWe3t2vl82aPm1E+ypTJa
5m47iKcVzFNGtc0LyxjSauuXPnboqIjmE3jIi6LcBngU/1\nK2sDtqT1pcDS3HMPAO8CWoEnc+UXAF/Jb5PWJ5MlLA0VR3t7e4zUunXrIiLi+CvufX1pBpW4mo3j\nKsZxFdOscUU0b2yjiQvYGFX81tf6rLBjUksFSVOA9wJPpjGTig8AW9L6amBBOtPrBLJB+g0RsRN4\nWdKcNH5yIXBPrs7CtH4u8FA6AGZm1gCTa7z/VmBlOrPrIGBVRNwr6TZJs8m6wrYDfwwQEVslrQIe\nB/qAyyJiX9rXpcAtwBSy7rE1qXwFcJukbcBPyM4qMzOzBqlpYomIx4C3D1D+4SHqLAOWDVC+ETh5\ngPLXgPNGF6mZmZXFV96bmVmpnFjMzKxUTiwlaFtyX6NDMDNrGk4sZmZWKicWMzMrlROLmZmVyonF\nzMxK5cRiZmalcmIxM7NSObGYmVmpnFhqyNe3mNlE5MRiZmalcmKpE7dezGyiqPVt8yc8JxQzm2jc\nYjEzs1JVlViUOa7WwYwHbqGY2URXVWJJU/3eX3Tnkn5Z0gZJj0raKunzqfxISQ9Kejr9fVOuzlJJ\n2yQ9JemsXHm7pJ703PVpimLSNMZ3pvKHJbUVjbNe2pbc58RjZuNeka6wRyS9s+D+9wLviYi3AbOB\neZLmAEuAtRExA1ibHiNpJtnUwicB84Ab07TGADcBlwAz0jIvlS8CXoqIE4HrgKsLxlh3Ti5mNp4V\nSSynAt+T9Iykx1Lr4bGhKkRmT3p4cFoCmA+sTOUrgXPS+nygKyL2RsSzwDbgFEmtwOERsT61nm7t\nV6eyr7uAMyqtGTMzqz9lv9NVbCgdP1B5RDw3TL1JwCbgROBLEXGFpJ9GxBHpeZG1OI6QdAOwPiJu\nT8+tANYA24GrIuLMVH46cEVEnC1pCzAvInak554BTo2IF/rFsRhYDNDS0tLe1dVV1fvub8+ePUyd\nOpWe3t0HlM+aPg2Ant7dB6xXq1JnpCpxNRvHVYzjKqZZ44LmjW00cc2dO3dTRHQMt13VpxtHxHOS\nTgNmRMT/kXQMMGx0EbEPmC3pCOBuSSf3ez4kVZfdRiEilgPLATo6OqKzs3NE++nu7qazs5OL+nVn\nbf9Qtr+Lltx3wHq1KnVGqhJXs3FcxTiuYpo1Lmje2OoRV9VdYZI+B1wBLE1FBwO3V1s/In4KrCMb\nG9mVurdIf59Pm/UC+bPPjk1lvWm9f/kBdSRNBqYBL1Ybl5mZlavIGMsHgPcDrwJExL8Bhw1VQdIx\nqaWCpCnAe4EngdXAwrTZQuCetL4aWJDO9DqBbJB+Q0TsBF6WNCd1nV3Yr05lX+cCD0W1/XtNxAP6\nZjZeFLny/uf5bitJh1ZRpxVYmcZZDgJWRcS9kr4HrJK0CHgOOB8gIrZKWgU8DvQBl6WuNIBLgVuA\nKWTjLmtS+QrgNknbgJ+QnVVmZmYNUiSxrJL0FeAISZcAFwNfHapCRDwGvH2A8heBMwapswxYNkD5\nRuDkAcpfA86r5g3Uy0hbH5V626/63TLDMTOrqyKD938l6b3Ay8BbgM9GxIM1i8zMzMakovcK6wH+\nAfhuWrca8HiLmY1lRc4K+yNgA/D7ZIPk6yVdXKvAJjrf/sXMxqoiYyyfBt6exkeQdBTwz8DNtQjM\nMvnk4rEXMxsLinSFvQi8knv8Cr5exMzM+hm2xSLpk2l1G/CwpHvYf7+vIe8VZuVqW3KfWy1m1vSq\n6QqrXAT5TFoq7hlgWzMzm+CGTSwR8fl6BGJmZuND1YP3kjqAzwDH5+tFxG/UIC4bhAfzzazZFTkr\n7A6yM8N6gF/UJhwzMxvriiSWH0fE6ppFYoXlWy+3zKvm1m1mZrVXJLF8TtLXyKYS3lspjIhvlR6V\nmZmNWUUSy0eAXyebh6XSFRaAE4uZmb2uSGJ5Z0S8tWaRmJnZuFDkyvt/ljSzZpGYmdm4UCSxzAE2\nS3
pK0mOSeiQNeeW9pOMkrZP0uKStkj6eyq+U1Ctpc1rel6uzVNK29Dpn5crb02tuk3R9mkmSNNvk\nnan8YUltRQ6AmZmVq0hX2LwR7L8PuDwiHpF0GLBJUmUOl+si4q/yG6cW0QLgJOBXgW9LekuaRfIm\n4BLgYeD+FM8aYBHwUkScKGkBcDXwwRHEOqb19O7mIk8UZmZNoEiLJQZZBq8QsTMiHknrrwBPANOH\nqDIf6IqIvRHxLNn9yU6R1AocHhHr03z2twLn5OqsTOt3AWdUWjNmZlZ/RRLLfcC96e9a4Afsn3d+\nWKmL6u1kLQ6Aj6YutZslvSmVTQd+mKu2I5VNT+v9yw+oExF9wG7gqGrjMjOzcilrAIygovQO4NKI\n+KMqtp0KfAdYFhHfktQCvEDW4vkC0BoRF0u6AVgfEbeneivIktd24KqIODOVnw5cERFnS9oCzIuI\nHem5Z4BTI+KFfjEsBhYDtLS0tHd1dY3ofe/Zs4epU6fS07v7gPJZ06cBvKG8XlqmwK6fHRhLM6gc\nr2bjuIpxXMU1a2yjiWvu3LmbIqJjuO2KjLEcII2bnDrcdpIOBr4J3FG5mDIiduWe/ypZSwigFzgu\nV/3YVNab1vuX5+vskDQZmMYA88RExHJgOUBHR0d0dnYO/yYH0N3dTWdn5+vjGRXbP5Ttr395vVw+\nq49retLH2fNqFlMTjLVUjlezcVzFOK7imjW2esRV5CaUn8w9PAh4B/Bvw9QRsAJ4IiKuzZW3RsTO\n9PADwJa0vhr4uqRryQbvZwAbImKfpJclzSHrSrsQ+GKuzkLge2RTJj8UI22GmZnZqBVpsRyWW+8j\nG2v55jB13g18GOiRtDmV/TlwgaTZZF1h24E/BoiIrZJWAY+n17gsnREGcClwCzCFrHusMr6zArhN\n0jbgJ2RnlRmeGMzMGqPqxDKSeVki4h+Bgc7Qun+IOsuAZQOUbwROHqD8NeC8orGZmVltFOkKewvw\nKaCNA+djeU/5YY09bQ0aWxlOm69tMbM6K9IV9jfAl4GvAfuG2daajLvFzKxeiiSWvoi4qWaRmJnZ\nuFDkAsm/lXSppFZJR1aWmkVmZmZjUpEWy8L099O5sgDeXF44ZmY21hU5K+yEoZ6X9N6IeHCobayx\n8icYeLzFzGqlSFfYcK4ucV9mZjZGlZlYfEdhMzMrNbH4NipjSNuS+5r22hszG9vKTCxmZmalJpbt\nJe7LzMzGqEK3zZf0m7zxli63pr+/X2pkZmY2JhW5V9htwH8BNrP/li6VaYJtjPIpyGZWtiItlg5g\npuc6MTOzoRQZY9kC/OdaBWJmZuNDkRbL0cDjkjYAeyuFEfH+0qOyhvAt9s2sDEUSy5VFdy7pOLIx\nmBay8ZjlEfHX6eaVd5KdCLAdOD8iXkp1lgKLyMZxPhYRD6TydvbPIHk/8PGICEmHpNdoJ5vr/oMR\nsb1orGZmVo6qu8Ii4jvAk2RTFB9GNo/9d4ap1gdcHhEzgTnAZZJmAkuAtRExA1ibHpOeWwCcBMwD\nbpQ0Ke3rJuASYEZa5qXyRcBLEXEicB2+tYyZWUNVnVgknQ9sIJsG+HzgYUnnDlUnInZGxCNp/RXg\nCWA6MB9YmTZbCZyT1ucDXRGxNyKeBbYBp0hqBQ6PiPXp5IFb+9Wp7Osu4AxJvr3MKPiKfDMbDVV7\nkpekR4H3RsTz6fExwLcj4m1V1m8Dvks2b/3/i4gjUrnIWhxHSLoBWB8Rt6fnVgBryLrLroqIM1P5\n6cAVEXG2pC3AvIjYkZ57Bjg1Il7o9/qLgcUALS0t7V1dXVW97/727NnD1KlT6endPaL6tdIyBXb9\nrPz9zpo+bVT1K8er2TiuYhxXcc0a22jimjt37qaI6BhuuyJjLAdVkkryIlW2eCRNBb4JfCIiXs43\nKNI4Sc1PYY6I5cBygI6Ojujs7BzRfrq7u+ns7OSiJvtf/eWz+rim
p9D1rlXZ/qHOUdWvHK9m47iK\ncVzFNWts9YiryC/R30l6APhGevxBskH0IUk6mCyp3BER30rFuyS1RsTO1M1VSVi9wHG56semst60\n3r88X2eHpMnANLKkZ2ZmDVBk8P7TZP/j/420LI+IK4aqk7q5VpAN9F+be2o1+2ekXAjckytfIOkQ\nSSeQDdJviIidwMuS5qR9XtivTmVf5wIP+SLO8ni8xcyKKtR3EhHfJGt9VOvdwIeBHkmbU9mfA1cB\nqyQtAp4jOxmAiNgqaRXwONkZZZdFROX2MZey/3TjNWmBLHHdJmkb8BOys8rMzKxBhk0skv4xIk6T\n9AoHzrkisiGSwwerGxH/yOATgJ0xSJ1lwLIByjeSDfz3L3+N7Ew1MzNrAsMmlog4Lf09rPbhWDPK\nX5Hvq/PNbDhFrmO5rZoyMzOb2IrchPKk/IN0BlZ7ueFYM/NAvplVY9jEImlpGl/5DUkvp+UVYBf7\nz8wyMzMDqkgsEfEXaXzlLyPi8LQcFhFHRcTSOsRoZmZjSJGusHslHQog6Q8lXSvp+BrFZU3O3WJm\nNpgiieUm4N8lvQ24HHgGT0tsZmb9FEksfemK9vnADRHxJbLb59sE1bbkPrdczOwNilx5/0qahOvD\nwOmSDgIOrk1YZmY2VhVJLB8E/htwcUT8SNKvAX9Zm7BsLMm3WnzhpJkVuQnlj8juE3ZIKnoBuLsW\nQZmZ2dhV5Mr7S8hmaPxKKpoO/N9aBGXjR0/vbo/DmE0wRQbvLyO7W/HLABHxNPArtQjKzMzGriKJ\nZW9E/LzyIN3SxfOe2AHcOjGzIonlO5L+HJgi6b3A3wB/W5uwbDxwkjGbmIokliXAj4Ee4I/JpiX+\nH0NVkHSzpOclbcmVXSmpV9LmtLwv99xSSdskPSXprFx5u6Se9Nz1aRZJ0kyTd6byhyW1FXg/ViO+\nvsVsYityVtgvIuKrEXFeRJyb1ofrCrsFmDdA+XURMTst9wNImkk2++NJqc6Nkial7W8CLiGbqnhG\nbp+LgJci4kTgOuDqat+PmZnVRpGzwp6V9IP+y1B1IuK7ZNMFV2M+0BUReyPiWWAbcIqkVuDwiFif\nEtmtwDm5OivT+l3AGZXWjDUXt2DMJo4iF0h25NZ/mWw64CNH+LoflXQhsBG4PCJeIjt9eX1umx2p\n7D/Sev9y0t8fAkREn6TdwFFk19iYmVkDaPjerCEqS5siYsjJvtK4x70RcXJ63EL2wx/AF4DWiLhY\n0g3A+oi4PW23AlgDbAeuiogzU/npwBURcXYau5kXETvSc88Ap0bEGxKLpMXAYoCWlpb2rq6uEb3n\nPXv2MHXqVHp6d4+ofq20TIFdP2t0FG+Uj2vW9GmNDSan8jk2G8dVTLPGBc0b22jimjt37qaI6Bhu\nu6pbLJLekXt4EFkLpkiLB4CI2JXb51eBe9PDXuC43KbHprLetN6/PF9nRzr9eRrw4iCvuxxYDtDR\n0RGdnZ1FQwegu7ubzs5OLmqyrp3LZ/VxTU/hj6Pm8nFt/1BnY4PJqXyOzcZxFdOscUHzxlaPuIr8\nEl3D/utW+shaEucVfUFJrRGxMz38AFA5Y2w18HVJ1wK/SjZIvyEi9qVZK+cADwMXAl/M1VkIfA84\nF3ioihMKzMyshooklnvJEktlcDyAsytj5RFxbf8Kkr4BdAJHS9oBfA7olDQ71d9OduoyEbFV0irg\ncbLEdVlE7Eu7upTsDLMpZN1ja1L5CuA2SdvIThJYUOD9WIO0LbnPN6s0G8eKJJZ24J1k89wL+D1g\nA/D0YBUi4oIBilcMsf0yYNkA5RuBkwcof40RtJqsMXxmmNnEUCSxHAu8IyJegexCR+C+iPjDWgRm\n41slybjlYjb+FLnyvgX4ee7xz1OZmZnZ64oklluBDemWLFeSDaTfUougbOJw95jZ+FN1V1hELJO0\nBjg9FX0kIr5fm7BsInG3mNn4
UujCh4h4BHikRrGYmdk4UKQrzKym3C1mNj44sZiZWamcWKypeC4X\ns7HPicXMzErVfHctNOPA8RafLWY2trjFYmZmpXKLxZqeWy9mY4tbLGZmViq3WGxMcevFrPm5xWJm\nZqWqaWKRdLOk59Pc9JWyIyU9KOnp9PdNueeWStom6SlJZ+XK2yX1pOeuV5pdTNIhku5M5Q9Laqvl\n+zEzs+HVusVyCzCvX9kSYG1EzADWpsdImkk2A+RJqc6NkialOjcBl5BNVzwjt89FwEsRcSJwHXB1\nzd6JNR1fTGnWnGqaWCLiu2RTBufNB1am9ZXAObnyrojYGxHPAtuAUyS1AodHxPo0n/2t/epU9nUX\ncEalNWMTh5OLWXNpxBhLS0TsTOs/Yv9kYdOBH+a225HKpqf1/uUH1ImIPmA3cFRtwrZm5taLWfNo\n6FlhERGSoh6vJWkxsBigpaWF7u7uEe1nz549dHd3c/msvhKjG72WKTRdTFD/uL54xz3Mmj5t2O0q\nn2OzcVzFNGtc0Lyx1SOuRiSWXZJaI2Jn6uZ6PpX3Asfltjs2lfWm9f7l+To7JE0GpgEvDvSiEbEc\nWA7Q0dERnZ2dIwq+u7ubzs5OLmqy/x1fPquPa3qa7+zxRsS1/UOdw25T+RybjeMqplnjguaNrR5x\nNaIrbDWwMK0vBO7JlS9IZ3qdQDZIvyF1m70saU4aP7mwX53Kvs4FHkrjMDaB5bvF3EVmVn81/a+k\npG8AncDRknYAnwOuAlZJWgQ8B5wPEBFbJa0CHgf6gMsiYl/a1aVkZ5hNAdakBWAFcJukbWQnCSyo\n5fuxscUJxawxappYIuKCQZ46Y5DtlwHLBijfCJw8QPlrwHmjidHMzMrlK+9tQnDrxax+nFhswvB4\ni1l9OLHYhOPkYlZbTiw2IfX07nYLxqxGnFjMzKxUzXdFnVmdeY4Xs3K5xWKW4+4xs9FzYjEzs1K5\nK8xsAO4eMxs5t1jMhuHuMbNi3GIxq1L/5OKWjNnA3GIxGyG3ZMwG5sRiNkpOLmYHcleYWQk82G+2\nnxOLWcmcZGyic1eYmZmVqmEtFknbgVeAfUBfRHRIOhK4E2gDtgPnR8RLafulwKK0/cci4oFU3s7+\n2SXvBz7u6YmtWQw0/uJWjI13jW6xzI2I2RHRkR4vAdZGxAxgbXqMpJlk0w6fBMwDbpQ0KdW5CbgE\nmJGWeXWM36ywytlkHvS38arZxljmA51pfSXQDVyRyrsiYi/wbJrj/pTU6jk8ItYDSLoVOAdYU9+w\nzUYun2BumXdoAyMxK4ca1Wsk6VlgN1nX1lciYrmkn0bEEel5AS9FxBGSbgDWR8Tt6bkVZMljO3BV\nRJyZyk8HroiIswd4vcXAYoCWlpb2rq6uEcW9Z88epk6dSk/v7hHVr5WWKbDrZ42O4o0cVzH5uGZN\nn9bYYHIq3/tm06xxQfPGNpq45s6duynXwzSoRrZYTouIXkm/Ajwo6cn8kxERkkrLehGxHFgO0NHR\nEZ2dnSPaT3d3N52dnVzUZN0Yl8/q45qeZmuAOq6iDoir51UgG5Np9FX/le99s2nWuKB5Y6tHXA0b\nY4mI3vT3eeBu4BRgl6RWgPT3+bR5L3Bcrvqxqaw3rfcvNxs3PBZjY01DEoukQyUdVlkHfhvYAqwG\nFqbNFgL3pPXVwAJJh0g6gWyQfkNE7AReljQndZ1dmKtjNm75BABrZo3qC2gB7s5yAZOBr0fE30n6\nF2CVpEXAc8D5ABGxVdIq4HGgD7gsIvalfV3K/tON1+CBe5tgBksuPq3ZGqUhiSUifgC8bYDyF4Ez\nBqmzDFg2QPlG4OSyYzQb69qW3Pd6csmvm9Vao69jMbMaGqi7zN1nVmvNd1qMmdVEPqEMdkcA3+fM\nyuAWi5kBb0w2+dZOT+9ut3Ssam6xmNmQ2pbcx+Wz9q/355aN9efEYmaj4rPSRqdy/Gp9vCqvU4
/b\nBjmxmFlNDNV15qQzvjmxmFndVTNeM9LkM1ALwKdb15cTi5k1peGSz3CJwicbNI4Ti5mNSdlJBX1V\n3xB2oFOp3ZKpDScWM5twhrumJy9/fc9YSELNcC2SE4uZ2RCqTUJFuubyLaZqthlo3/1v2dNMnFjM\nzErQ/8d9qG66gRJBtbfeKdLaalTCcWIxM2tyzdYiGY5v6WJmZqVyYjEzs1KNi8QiaZ6kpyRtk7Sk\n0fGYmU1kYz6xSJoEfAn4HWAmcIGkmY2Nysxs4hrziQU4BdgWET+IiJ8DXcD8BsdkZjZhjYfEMh34\nYe7xjlRmZmYNoIhodAyjIulcYF5E/FF6/GHg1Ij4037bLQYWp4dvBZ4a4UseDbwwwrq15LiKcVzF\nOK7imjW20cR1fEQcM9xG4+E6ll7guNzjY1PZASJiObB8tC8maWNEdIx2P2VzXMU4rmIcV3HNGls9\n4hoPXWH/AsyQdIKkXwIWAKsbHJOZ2YQ15lssEdEn6U+BB4BJwM0RsbXBYZmZTVhjPrEARMT9wP11\nerlRd6fViOMqxnEV47iKa9bYah7XmB+8NzOz5jIexljMzKyJOLEU0Cy3jpF0nKR1kh6XtFXSx1P5\nlZJ6JW1Oy/saENt2ST3p9TemsiMlPSjp6fT3TXWO6a25Y7JZ0suSPtGI4yXpZknPS9qSKxv0+Eha\nmr5vT0k6q85x/aWkJyU9JuluSUek8jZJP8sdty/XOa5BP7cGH687czFtl7Q5ldfzeA3221Df71hE\neKliITsx4BngzcAvAY8CMxsUSyvwjrR+GPCvZLezuRL4VIOP03bg6H5l/wtYktaXAFc3+HP8EXB8\nI44X8FvAO4Atwx2f9Jk+ChwCnJC+f5PqGNdvA5PT+tW5uNry2zXgeA34uTX6ePV7/hrgsw04XoP9\nNtT1O+YWS/Wa5tYxEbEzIh5J668AT9DcdxuYD6xM6yuBcxoYyxnAMxHxXCNePCK+C/ykX/Fgx2c+\n0BUReyPiWWAb2fewLnFFxN9HRF96uJ7sGrG6GuR4Daahx6tCkoDzgW/U4rWHMsRvQ12/Y04s1WvK\nW8dIagPeDjycij6aui5urneXUxLAtyVtSnc7AGiJiJ1p/UdASwPiqljAgf/gG328YPDj00zfuYuB\nNbnHJ6Rune9IOr0B8Qz0uTXL8Tod2BURT+fK6n68+v021PU75sQyhkmaCnwT+EREvAzcRNZVNxvY\nSdYcr7fTImI22d2mL5P0W/knI2t/N+RURGUX0L4f+JtU1AzH6wCNPD6DkfQZoA+4IxXtBH4tfc6f\nBL4u6fAM5opeAAAEsklEQVQ6htR0n1s/F3Dgf17qfrwG+G14XT2+Y04s1avq1jH1Iulgsi/OHRHx\nLYCI2BUR+yLiF8BXqVE3wFAiojf9fR64O8WwS1JrirsVeL7ecSW/AzwSEbtSjA0/Xslgx6fh3zlJ\nFwFnAx9KP0ikbpMX0/omsn75t9QrpiE+t2Y4XpOB3wfurJTV+3gN9NtAnb9jTizVa5pbx6Q+3BXA\nExFxba68NbfZB4At/evWOK5DJR1WWScb/N1CdpwWps0WAvfUM66cA/4n2ejjlTPY8VkNLJB0iKQT\ngBnAhnoFJWke8GfA+yPi33PlxyibBwlJb05x/aCOcQ32uTX0eCVnAk9GxI5KQT2P12C/DdT7O1aP\nMxXGywK8j+wsi2eAzzQwjtPImrKPAZvT8j7gNqAnla8GWusc15vJzjB5FNhaOUbAUcBa4Gng28CR\nDThmhwIvAtNyZXU/XmSJbSfwH2T92YuGOj7AZ9L37Sngd+oc1zay/vfKd+zLads/SJ/vZuAR4Pfq\nHNegn1sjj1cqvwX4k37b1vN4DfbbUNfvmK+8NzOzUrkrzMzMSuXEYmZmpXJiMTOzUjmxmJlZqZxY\nzMysVE4sZmZWKicWsypJukjSDTXY56+Oov5sDXO7/1rEbT
YUJxazxroIGHFiIbtfVt3n3TEbihOL\nTWiSPi3pY2n9OkkPpfX3SLpD0kck/aukDcC7c/ValE1+9WhafjOVf1LSlrR8IpW1SXpC0lfT5Et/\nL2mKpHOBDuCOdOfbKZLa0x1wN0l6IHd/p25JV0vakOI5Pd1a6H8CH0z1P1jF+z1G0jcl/Uta3p3K\nr0x3Cu6W9IPKMTEbCScWm+j+gew255D9yE9NN/E7nez2PZ8nSyinkU2KVHE98J2IeBvZhE9bJbUD\nHwFOBeYAl0h6e9p+BvCliDgJ+CnwBxFxF7CR7AaPs8nuIPxF4NyIaAduBpblXnNyRJwCfAL4XGTz\nAn0WuDMiZkfEnQzvr4HrIuKdZLca+VruuV8HziK7qePn0nEwK2xyowMwa7BNQHu6jflesns5dZAl\nloeA7oj4MWRTz7L/rrTvAS4EiIh9wG5JpwF3R8Sraftvpf2sBp6NiM2512wbIJa3AicDD2b3EmQS\n2f2oKip3qh2sfjXOBGam/QMcnm6xDnBfROwF9kp6nmzOjh0D7MNsSE4sNqFFxH9IepZsrOOfyW7e\nNxc4EfgS2f/iy7A3t74PmDLANgK2RsS7htnHPkb+b/cgYE5EvHbAC2eJpn+M/n2wEXFXmFnWHfYp\n4Ltp/U+A75NNx/tfJR2VuoXOy9VZC/x3AEmTJE1Ldc+R9J/StAEfSGVDeYVsbnLI7i57jKR3pf0e\nLOmkAvWr8ffARysPJM0uUNesKk4sZtmPfyvwvcgmAXsN+IfIpnK9Evge8E9k84dXfByYK6mHrGtq\nZmRzjd9CNp/Fw8DXIuL7w7z2LcCXJW0m6/o6F7ha0qNktzz/zWHqryPr2qpq8B74GNChbFrfx8mS\nqFmpfNt8MzMrlVssZmZWKg/OmY0Tkj5C1kWX908RcVkj4rGJy11hZmZWKneFmZlZqZxYzMysVE4s\nZmZWKicWMzMrlROLmZmV6v8D/gnHVhPWx18AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fb1162457d0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "wdcontent_len = df_train_content.wdcontent_len.values\n",
    "max_len = max(wdcontent_len)\n",
    "min_len = min(wdcontent_len)\n",
    "mean_len = np.mean(wdcontent_len)\n",
    "median_len = np.median(wdcontent_len)\n",
    "print '** content 长度（词数）'\n",
    "print 'max = %d， min = %d, mean = %d, median = %d' % (max_len, min_len, mean_len, median_len)\n",
    "\n",
    "plt.hist(wdcontent_len, bins = 200, range=(0, 200))\n",
    "plt.xlabel('wdcontent_len')\n",
    "plt.ylabel('question_number')\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## baseline\n",
    "**先使用 title 数据来做一各 baseline 模型。**\n",
    "\n",
    "使用 attention-bigru， max_len 取值为 20.\n",
    "\n",
    "**使用 content 数据训练 baseline2**\n",
    "\n",
    "使用 attention-bigru，max_len 取值 100\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
